Zantiks manipulation scripts

library(tidyverse)
## Warning: package 'dplyr' was built under R version 4.0.5
library(lme4)
library(lmerTest)
library(here) # write out the path
library(kableExtra)
library(sjlabelled)
library(ggpubr)
library(rstatix)
library(car)
library(RColorBrewer)
library(nlme)

# Colour constants used across all plots (kept in one place for easy re-theming).
# Use `<-` for top-level assignment, per the tidyverse style guide.
col_accent1 <- "slateblue1"   # primary accent (histograms, boxplots, violins)
col_accent2 <- "springgreen3" # secondary accent
col_detail1 <- "tomato2"      # detail / highlight colour

Locomotion

Data parsing

First let’s test-load one data file to see how to trim it into the relevant bits. The file format is defined such that the first 4 rows and the first 6 columns are redundant.

The first portion loads and parses the raw data file lines.

# Locate the locomotion data directory and enumerate all raw Zantiks CSV files.
# Reuse data_path instead of repeating the here() call, and spell out TRUE.
data_path <- here("Data/locomotion/")
data_files <- list.files(data_path, recursive = TRUE)
data_files[1:8] # the first 8 files
## [1] "E_locomotion_traking-20000228T224844.csv"
## [2] "E_locomotion_traking-20000229T005421.csv"
## [3] "E_locomotion_traking-20000229T031017.csv"
## [4] "E_locomotion_traking-20000229T043418.csv"
## [5] "E_locomotion_traking-20000229T224459.csv"
## [6] "E_locomotion_traking-20000301T001741.csv"
## [7] "E_locomotion_traking-20000301T030703.csv"
## [8] "E_locomotion_traking-20000301T042219.csv"

Currently each file represents a set of uniquely analyzed individuals and hence no additional complications arise (like, e.g., having two files with the same individual assayed on two occasions).

In the next steps we test the approach in stages. First, we process a sample CSV file, which is unstructured and contains many spurious lines and data (e.g., control variables and comments generated by the unit).

# Read one raw file as plain text lines. file.path() joins the path and the
# file name portably, so the same code works on Mac, PC and Linux — this
# replaces the two commented platform-specific paste() variants.
dat_temp <- readLines(file.path(data_path, data_files[1]))

glimpse(dat_temp)
##  chr [1:11] "0.000,\"Info\",\"2000-02-28 22:48:44\",\"Service 'E_locomotion_traking' : Execution start\"" ...

The loaded data is just a vector of strings, each being a line from the original CSV file. In the next chunk we skip the first 4 lines (a fixed number of lines containing technical parameters of the unit), then read the header line plus 5 data rows (6 lines in total), skipping the last technical line. At the end we clean the file (removing empty columns 2:6 and wells c(F6, F7, F8) - they are always empty in our system). Adjust as needed. The resulting data frame contains each assayed well as a separate column, and for each well there are 5 time intervals of locomotion monitoring.

# Parse the raw lines as CSV: skip the 4 technical header lines, read the
# 5 data rows, and drop the trailing technical line. Then strip the columns
# and wells that are always empty in our system.
dat_temp_df <- read.csv(
  text = dat_temp,
  header = TRUE, sep = ',', quote = '\"', dec = '.',
  skip = 4, nrows = length(dat_temp) - 4 - 2,
  stringsAsFactors = FALSE
)

dat_temp_df <- dat_temp_df[, -(2:6)]            # spurious control columns
dat_temp_df <- dat_temp_df %>% select(!(F6:F8)) # wells that never contain data
dat_temp_df
##       TIME TEMPERATURE ROUND         VARIABLE  A1   A2  A3  A4   A5   A6  A7
## 1  964.342       24.78     1 BINARENADISTANCE 0.0 12.5 2.8 1.4  3.2  8.6 4.6
## 2 1324.351       24.96     2 BINARENADISTANCE 0.0  1.4 8.8 4.6  4.4  3.0 5.8
## 3 1684.354       25.04     3 BINARENADISTANCE 1.4  1.4 5.8 9.1  6.3 12.1 0.0
## 4 2044.358       25.04     4 BINARENADISTANCE 1.4  1.6 5.8 2.8 10.7 39.9 0.0
## 5 2404.362       24.87     5 BINARENADISTANCE 1.4  2.8 8.8 2.8 12.3 15.6 1.6
##     A8   B1    B2    B3   B4   B5    B6   B7    B8    C1  C2   C3  C4  C5   C6
## 1  1.4  3.0   6.0   2.8  0.0  3.0   3.0 10.2   1.4   6.3 0.0  3.0 1.4 6.0  5.8
## 2  4.4  1.4  10.4  24.6  0.0 23.9   4.4  4.4  16.7  56.6 1.4 17.2 6.0 1.4  3.0
## 3 36.4  7.7  21.4   1.4  7.9 39.2  50.6 29.5 133.2 144.1 1.4  1.4 3.2 1.6 11.4
## 4  1.6  8.8 149.5   1.4  5.8 85.9 124.2 28.5  56.2  31.6 6.3  7.4 2.8 2.8  5.8
## 5  3.2 23.7 171.5 159.0 31.6 21.1  16.5 62.0 115.6  52.0 3.0 38.3 7.4 4.4  4.2
##      C7  C8   D1   D2  D3  D4  D5  D6   D7   D8   E1  E2   E3  E4  E5    E6
## 1   0.0 1.4  5.8  1.4 6.0 1.4 1.4 0.0  6.0  2.8  7.4 0.0 10.4 7.4 3.2   4.4
## 2   5.8 1.4  2.8  6.5 9.1 1.4 4.4 3.0  8.6  2.8  2.8 0.0 12.3 1.4 0.0   6.0
## 3  30.9 0.0  1.4  6.5 6.0 6.0 3.2 5.8  1.4  6.0  4.4 4.4  1.6 1.6 1.4   1.4
## 4  81.5 3.0 18.1 17.4 8.8 7.2 1.4 2.8 10.7  4.4 19.5 1.4  5.8 3.0 4.4  17.6
## 5 162.5 2.8 52.9 24.6 5.1 1.4 7.9 4.2 10.0 26.9  2.8 1.6 16.5 3.0 1.4 234.2
##     E7  E8   F1    F2  F3   F4  F5
## 1  4.4 2.8  7.4  26.9 2.8  6.0 5.8
## 2  2.8 5.8  1.4   4.4 1.6  3.0 7.2
## 3  1.4 4.4  5.8   3.0 3.0 13.5 2.8
## 4 32.3 1.6 13.2 295.5 3.0 11.6 5.8
## 5 14.6 6.0  2.8   6.7 6.3 17.6 6.0
glimpse(dat_temp_df)
## Rows: 5
## Columns: 49
## $ TIME        <dbl> 964.342, 1324.351, 1684.354, 2044.358, 2404.362
## $ TEMPERATURE <dbl> 24.78, 24.96, 25.04, 25.04, 24.87
## $ ROUND       <int> 1, 2, 3, 4, 5
## $ VARIABLE    <chr> "BINARENADISTANCE", "BINARENADISTANCE", "BINARENADISTANCE"…
## $ A1          <dbl> 0.0, 0.0, 1.4, 1.4, 1.4
## $ A2          <dbl> 12.5, 1.4, 1.4, 1.6, 2.8
## $ A3          <dbl> 2.8, 8.8, 5.8, 5.8, 8.8
## $ A4          <dbl> 1.4, 4.6, 9.1, 2.8, 2.8
## $ A5          <dbl> 3.2, 4.4, 6.3, 10.7, 12.3
## $ A6          <dbl> 8.6, 3.0, 12.1, 39.9, 15.6
## $ A7          <dbl> 4.6, 5.8, 0.0, 0.0, 1.6
## $ A8          <dbl> 1.4, 4.4, 36.4, 1.6, 3.2
## $ B1          <dbl> 3.0, 1.4, 7.7, 8.8, 23.7
## $ B2          <dbl> 6.0, 10.4, 21.4, 149.5, 171.5
## $ B3          <dbl> 2.8, 24.6, 1.4, 1.4, 159.0
## $ B4          <dbl> 0.0, 0.0, 7.9, 5.8, 31.6
## $ B5          <dbl> 3.0, 23.9, 39.2, 85.9, 21.1
## $ B6          <dbl> 3.0, 4.4, 50.6, 124.2, 16.5
## $ B7          <dbl> 10.2, 4.4, 29.5, 28.5, 62.0
## $ B8          <dbl> 1.4, 16.7, 133.2, 56.2, 115.6
## $ C1          <dbl> 6.3, 56.6, 144.1, 31.6, 52.0
## $ C2          <dbl> 0.0, 1.4, 1.4, 6.3, 3.0
## $ C3          <dbl> 3.0, 17.2, 1.4, 7.4, 38.3
## $ C4          <dbl> 1.4, 6.0, 3.2, 2.8, 7.4
## $ C5          <dbl> 6.0, 1.4, 1.6, 2.8, 4.4
## $ C6          <dbl> 5.8, 3.0, 11.4, 5.8, 4.2
## $ C7          <dbl> 0.0, 5.8, 30.9, 81.5, 162.5
## $ C8          <dbl> 1.4, 1.4, 0.0, 3.0, 2.8
## $ D1          <dbl> 5.8, 2.8, 1.4, 18.1, 52.9
## $ D2          <dbl> 1.4, 6.5, 6.5, 17.4, 24.6
## $ D3          <dbl> 6.0, 9.1, 6.0, 8.8, 5.1
## $ D4          <dbl> 1.4, 1.4, 6.0, 7.2, 1.4
## $ D5          <dbl> 1.4, 4.4, 3.2, 1.4, 7.9
## $ D6          <dbl> 0.0, 3.0, 5.8, 2.8, 4.2
## $ D7          <dbl> 6.0, 8.6, 1.4, 10.7, 10.0
## $ D8          <dbl> 2.8, 2.8, 6.0, 4.4, 26.9
## $ E1          <dbl> 7.4, 2.8, 4.4, 19.5, 2.8
## $ E2          <dbl> 0.0, 0.0, 4.4, 1.4, 1.6
## $ E3          <dbl> 10.4, 12.3, 1.6, 5.8, 16.5
## $ E4          <dbl> 7.4, 1.4, 1.6, 3.0, 3.0
## $ E5          <dbl> 3.2, 0.0, 1.4, 4.4, 1.4
## $ E6          <dbl> 4.4, 6.0, 1.4, 17.6, 234.2
## $ E7          <dbl> 4.4, 2.8, 1.4, 32.3, 14.6
## $ E8          <dbl> 2.8, 5.8, 4.4, 1.6, 6.0
## $ F1          <dbl> 7.4, 1.4, 5.8, 13.2, 2.8
## $ F2          <dbl> 26.9, 4.4, 3.0, 295.5, 6.7
## $ F3          <dbl> 2.8, 1.6, 3.0, 3.0, 6.3
## $ F4          <dbl> 6.0, 3.0, 13.5, 11.6, 17.6
## $ F5          <dbl> 5.8, 7.2, 2.8, 5.8, 6.0

Here we extract and append the run (subject) ID.

# Extract the run (subject) ID from the file's technical header (first 4 lines).
# NOTE: the original used paste(..., sep = ''), inconsistent with the working
# sep = '/' read above; file.path() joins the path portably in both cases.
head_temp <- readLines(file.path(data_path, data_files[1]), n = 4)

# Using a regular expression keeps this robust to the ID moving between lines.
# The pattern encodes the exact run-ID format: <unit>_B_<yymmdd>_<n>.
# (The original first did a looser grep('Subject Identification', ...) whose
# result was immediately overwritten — that dead assignment is removed.)
id_pattern <- '.*Subject Identification\\\",\"([A-Z0-9]{7}_B_[0-9]{6}_[0-9]{1,2})\\\"'
id_index <- grep(id_pattern, head_temp)
run_id <- gsub(id_pattern, '\\1', head_temp[id_index])
dat_temp_df$Datafile_ID <- run_id

# The assay date is embedded in the run ID at characters 11-16 (yymmdd).
assay_date <- str_sub(run_id, 11, 16)
dat_temp_df$date <- as.Date(assay_date, format = "%y%m%d")
glimpse(dat_temp_df)
## Rows: 5
## Columns: 51
## $ TIME        <dbl> 964.342, 1324.351, 1684.354, 2044.358, 2404.362
## $ TEMPERATURE <dbl> 24.78, 24.96, 25.04, 25.04, 24.87
## $ ROUND       <int> 1, 2, 3, 4, 5
## $ VARIABLE    <chr> "BINARENADISTANCE", "BINARENADISTANCE", "BINARENADISTANCE"…
## $ A1          <dbl> 0.0, 0.0, 1.4, 1.4, 1.4
## $ A2          <dbl> 12.5, 1.4, 1.4, 1.6, 2.8
## $ A3          <dbl> 2.8, 8.8, 5.8, 5.8, 8.8
## $ A4          <dbl> 1.4, 4.6, 9.1, 2.8, 2.8
## $ A5          <dbl> 3.2, 4.4, 6.3, 10.7, 12.3
## $ A6          <dbl> 8.6, 3.0, 12.1, 39.9, 15.6
## $ A7          <dbl> 4.6, 5.8, 0.0, 0.0, 1.6
## $ A8          <dbl> 1.4, 4.4, 36.4, 1.6, 3.2
## $ B1          <dbl> 3.0, 1.4, 7.7, 8.8, 23.7
## $ B2          <dbl> 6.0, 10.4, 21.4, 149.5, 171.5
## $ B3          <dbl> 2.8, 24.6, 1.4, 1.4, 159.0
## $ B4          <dbl> 0.0, 0.0, 7.9, 5.8, 31.6
## $ B5          <dbl> 3.0, 23.9, 39.2, 85.9, 21.1
## $ B6          <dbl> 3.0, 4.4, 50.6, 124.2, 16.5
## $ B7          <dbl> 10.2, 4.4, 29.5, 28.5, 62.0
## $ B8          <dbl> 1.4, 16.7, 133.2, 56.2, 115.6
## $ C1          <dbl> 6.3, 56.6, 144.1, 31.6, 52.0
## $ C2          <dbl> 0.0, 1.4, 1.4, 6.3, 3.0
## $ C3          <dbl> 3.0, 17.2, 1.4, 7.4, 38.3
## $ C4          <dbl> 1.4, 6.0, 3.2, 2.8, 7.4
## $ C5          <dbl> 6.0, 1.4, 1.6, 2.8, 4.4
## $ C6          <dbl> 5.8, 3.0, 11.4, 5.8, 4.2
## $ C7          <dbl> 0.0, 5.8, 30.9, 81.5, 162.5
## $ C8          <dbl> 1.4, 1.4, 0.0, 3.0, 2.8
## $ D1          <dbl> 5.8, 2.8, 1.4, 18.1, 52.9
## $ D2          <dbl> 1.4, 6.5, 6.5, 17.4, 24.6
## $ D3          <dbl> 6.0, 9.1, 6.0, 8.8, 5.1
## $ D4          <dbl> 1.4, 1.4, 6.0, 7.2, 1.4
## $ D5          <dbl> 1.4, 4.4, 3.2, 1.4, 7.9
## $ D6          <dbl> 0.0, 3.0, 5.8, 2.8, 4.2
## $ D7          <dbl> 6.0, 8.6, 1.4, 10.7, 10.0
## $ D8          <dbl> 2.8, 2.8, 6.0, 4.4, 26.9
## $ E1          <dbl> 7.4, 2.8, 4.4, 19.5, 2.8
## $ E2          <dbl> 0.0, 0.0, 4.4, 1.4, 1.6
## $ E3          <dbl> 10.4, 12.3, 1.6, 5.8, 16.5
## $ E4          <dbl> 7.4, 1.4, 1.6, 3.0, 3.0
## $ E5          <dbl> 3.2, 0.0, 1.4, 4.4, 1.4
## $ E6          <dbl> 4.4, 6.0, 1.4, 17.6, 234.2
## $ E7          <dbl> 4.4, 2.8, 1.4, 32.3, 14.6
## $ E8          <dbl> 2.8, 5.8, 4.4, 1.6, 6.0
## $ F1          <dbl> 7.4, 1.4, 5.8, 13.2, 2.8
## $ F2          <dbl> 26.9, 4.4, 3.0, 295.5, 6.7
## $ F3          <dbl> 2.8, 1.6, 3.0, 3.0, 6.3
## $ F4          <dbl> 6.0, 3.0, 13.5, 11.6, 17.6
## $ F5          <dbl> 5.8, 7.2, 2.8, 5.8, 6.0
## $ Datafile_ID <chr> "Z0107D3_B_210421_1", "Z0107D3_B_210421_1", "Z0107D3_B_210…
## $ date        <date> 2021-04-21, 2021-04-21, 2021-04-21, 2021-04-21, 2021-04-21

Change to long format. Now each well x time combination has its own row, and since we have the datafile ID we can link wells to specific individuals - e.g., to link-in sex information. We also rename variables according to the convention: categorical variables - Names_start_with_upper_case; continuous variables - all_lower_case.

# Reshape to long format: one row per well x time-bin. Columns are renamed to
# the project convention (Categorical_Upper_Case, continuous_lower_case).
dat_temp_df <- dat_temp_df %>%
  pivot_longer(
    cols = matches('[A-H][1-9]'),
    names_to = 'well_id',
    values_to = 'arena_distance'
  ) %>%
  rename(
    time = TIME, temperature = TEMPERATURE, round = ROUND,
    Variable = VARIABLE, Date = date, Well_ID = well_id
  )

head(dat_temp_df)
## # A tibble: 6 × 8
##    time temperature round Variable Datafile_ID Date       Well_ID arena_distance
##   <dbl>       <dbl> <int> <chr>    <chr>       <date>     <chr>            <dbl>
## 1  964.        24.8     1 BINAREN… Z0107D3_B_… 2021-04-21 A1                 0  
## 2  964.        24.8     1 BINAREN… Z0107D3_B_… 2021-04-21 A2                12.5
## 3  964.        24.8     1 BINAREN… Z0107D3_B_… 2021-04-21 A3                 2.8
## 4  964.        24.8     1 BINAREN… Z0107D3_B_… 2021-04-21 A4                 1.4
## 5  964.        24.8     1 BINAREN… Z0107D3_B_… 2021-04-21 A5                 3.2
## 6  964.        24.8     1 BINAREN… Z0107D3_B_… 2021-04-21 A6                 8.6

We have to add the Individuals_ID variable to be able to link locomotion data to sex data. It will serve to link Datafile_ID with respective Individuals_ID - and through it with appropriate well number. The run register contains several variables used to group individuals into several blocks that may share some of the systematic variation.

run_meta <- read_delim(here('Data', 'run_data', 'ID_metadata.csv'), delim = ';')
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ID_name = col_character(),
##   ID_explanation = col_character()
## )
run_meta %>%
  kbl() %>% kable_material(c('striped'))
ID_name ID_explanation
Individuals_ID Batch of individuals sexed together on one 96-well plate; used to link sex information with individual entries in data files; = group of individuals tested for multiple assays (each individual goes through each of 3 assays)
Batch_ID Batch of assays performed at a given time (i.e., a set of tests started on one occasion); the number of batches with a given ID must be <= the number of units available; usually locomotion and 3 y-maze tests would be run together, and 1 habituation test separately (hence giving rise to two batches)
Exp_block Set of plates tested together (i.e., sexed together and handled together during one session of testing, e.g., before noon or after noon); each exp_block contains 2x45 groups identified by Individuals_ID
run_register <- read_delim(here('Data', 'run_data', 'run_register_pilot2.csv'), delim = ';')
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   No = col_double(),
##   Date = col_character(),
##   Operator = col_character(),
##   Unit_ID = col_character(),
##   Experiment_zanscript = col_character(),
##   Data_file_produced_before_namechange = col_logical(),
##   Individuals_ID = col_character(),
##   Batch_ID = col_character(),
##   Exp_block = col_character(),
##   Plate = col_character(),
##   Datafile_ID = col_character(),
##   Time_raw = col_character(),
##   Time_corr = col_character(),
##   Sequence = col_double(),
##   Daytime = col_double()
## )
glimpse(run_register)
## Rows: 40
## Columns: 15
## $ No                                   <dbl> 4, 1, 2, 3, 5, 6, 7, 9, 8, 10, 14…
## $ Date                                 <chr> "22/4/2021", "22/4/2021", "22/4/2…
## $ Operator                             <chr> "ELM/Sammy", "ELM/Sammy", "ELM/Sa…
## $ Unit_ID                              <chr> "Z0107D3", "Z5B0E17", "Z8E63AA", …
## $ Experiment_zanscript                 <chr> "E_locomotion_tracking.zs", "E_ym…
## $ Data_file_produced_before_namechange <lgl> NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Individuals_ID                       <chr> "ID_210421_2", "ID_210421_1", "ID…
## $ Batch_ID                             <chr> "B_210421_1", "B_210421_1", "B_21…
## $ Exp_block                            <chr> "B_210421_1", "B_210421_1", "B_21…
## $ Plate                                <chr> NA, "YMAZE_plate1", "YMAZE_plate2…
## $ Datafile_ID                          <chr> "Z0107D3_B_210421_1", "Z5B0E17_B_…
## $ Time_raw                             <chr> "28/2/2000 22:48", "21/4/2021 23:…
## $ Time_corr                            <chr> "21/4/2021 22:48", "21/4/2021 23:…
## $ Sequence                             <dbl> 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, …
## $ Daytime                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, …
# Link in run metadata via the Datafile_ID key. Spelling out `by` silences the
# join message and guards against dplyr picking up accidental extra key columns.
dat_temp_df <- dat_temp_df %>%
  left_join(select(run_register, Datafile_ID, Individuals_ID, Exp_block, Batch_ID, Sequence),
            by = "Datafile_ID")

# Unique individual identifier = sexing-batch ID + well position.
dat_temp_df <- dat_temp_df %>%
  mutate(Individuals_ID_well = paste0(Individuals_ID, '_', Well_ID))

dat_temp_df <- dat_temp_df %>% select(Individuals_ID_well, Individuals_ID, Datafile_ID, Date,
                                      Batch_ID, Sequence, Exp_block, temperature, round, arena_distance)
glimpse(dat_temp_df)
## Rows: 225
## Columns: 10
## $ Individuals_ID_well <chr> "ID_210421_2_A1", "ID_210421_2_A2", "ID_210421_2_A…
## $ Individuals_ID      <chr> "ID_210421_2", "ID_210421_2", "ID_210421_2", "ID_2…
## $ Datafile_ID         <chr> "Z0107D3_B_210421_1", "Z0107D3_B_210421_1", "Z0107…
## $ Date                <date> 2021-04-21, 2021-04-21, 2021-04-21, 2021-04-21, 2…
## $ Batch_ID            <chr> "B_210421_1", "B_210421_1", "B_210421_1", "B_21042…
## $ Sequence            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Exp_block           <chr> "B_210421_1", "B_210421_1", "B_210421_1", "B_21042…
## $ temperature         <dbl> 24.78, 24.78, 24.78, 24.78, 24.78, 24.78, 24.78, 2…
## $ round               <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arena_distance      <dbl> 0.0, 12.5, 2.8, 1.4, 3.2, 8.6, 4.6, 1.4, 3.0, 6.0,…

Writing a function and using map to apply data loading and wrangling to all files. Note: the function uses the pattern found in the text files, so an important point is: do not re-save Zantiks CSV files with any external software (be it MS Excel, Numbers, Google Sheets, etc.) as it will mess with the formatting and the way text strings are coded. Adjust the REs accordingly to adapt the function to your file structure whenever needed.

data_compiler <- function(filename, data_path, run_register) {

  ## Load one raw Zantiks locomotion CSV, parse it, attach run metadata, and
  ## return a tidy long-format data frame (one row per well x time-bin).
  ##
  ## ARG filename     a single file name to process (only the file, no full path)
  ## ARG data_path    text string with the path to access all files
  ## ARG run_register database of all runs with datafile IDs and sexing IDs
  ##
  ## ARGS to develop: passing custom RE, custom wells to skip, custom columns to skip

  # file.path() joins path and file portably; the original paste(..., sep = '')
  # silently relied on data_path carrying a trailing slash
  dat_temp <- readLines(file.path(data_path, filename))

  # file parsing: skip the 4 technical header lines, drop the trailing technical line
  dat_temp_df <- read.csv(text = dat_temp, header = TRUE, sep = ',',
                          quote = '\"', dec = '.', skip = 4,
                          nrows = length(dat_temp) - 4 - 2,
                          stringsAsFactors = FALSE)

  ## these lines are design specific - for now the function has a closed form on those
  ## possible to implement as an additional argument
  dat_temp_df <- dat_temp_df[, -(2:6)]            # spurious control columns
  dat_temp_df <- dat_temp_df %>% select(!(F6:F8)) # wells that never contain data

  # extract run ID from the technical header (first 4 lines)
  head_temp <- readLines(file.path(data_path, filename), n = 4)

  # RE encodes the exact run-ID format: <unit>_B_<yymmdd>_<n>
  id_pattern <- '.*Subject Identification\\\",\"([A-Z0-9]{7}_B_[0-9]{6}_[0-9]{1,2})\\\"'
  id_index <- grep(id_pattern, head_temp)
  run_id <- gsub(id_pattern, '\\1', head_temp[id_index])
  dat_temp_df$Datafile_ID <- run_id

  # assay date is embedded in the run ID (yymmdd); convert to Date class so the
  # compiled data matches the interactive prototype above (which used as.Date);
  # the original left it as a character string ("210421")
  assay_date <- str_sub(run_id, 11, 16)
  dat_temp_df$date <- as.Date(assay_date, format = "%y%m%d")

  # long format: one row per well x time-bin; rename to the project convention
  dat_temp_df <- dat_temp_df %>%
    pivot_longer(names_to = 'well_id', values_to = 'arena_distance',
                 cols = matches('[A-H][1-9]')) %>%
    rename(time = TIME, temperature = TEMPERATURE, round = ROUND,
           Variable = VARIABLE, Date = date, Well_ID = well_id)

  # attach run metadata; explicit `by` avoids the join message
  dat_temp_df <- dat_temp_df %>%
    left_join(select(run_register, Datafile_ID, Individuals_ID, Exp_block,
                     Batch_ID, Sequence),
              by = "Datafile_ID")

  dat_temp_df <- dat_temp_df %>%
    mutate(Individuals_ID_well = paste0(Individuals_ID, '_', Well_ID))

  dat_temp_df$Filename <- filename

  dat_temp_df %>%
    select(Individuals_ID_well, Individuals_ID, Datafile_ID, Date, Filename,
           Exp_block, Batch_ID, Sequence, temperature, round, arena_distance)
}

Process the files using the above function, mapping it row-wise into a new data-frame.

# Not run: test
# data_compiler(data_files[1], data_path, run_register)

# Apply the compiler to every raw file and stack the results row-wise;
# extra named arguments are forwarded to data_compiler by map_dfr.
data_locomotion <- map_dfr(data_files, data_compiler,
                           data_path = data_path, run_register = run_register)

length(unique(data_locomotion$Datafile_ID)) # should be 8 distinct files
## [1] 8
glimpse(data_locomotion)
## Rows: 1,800
## Columns: 11
## $ Individuals_ID_well <chr> "ID_210421_2_A1", "ID_210421_2_A2", "ID_210421_2_A…
## $ Individuals_ID      <chr> "ID_210421_2", "ID_210421_2", "ID_210421_2", "ID_2…
## $ Datafile_ID         <chr> "Z0107D3_B_210421_1", "Z0107D3_B_210421_1", "Z0107…
## $ Date                <chr> "210421", "210421", "210421", "210421", "210421", …
## $ Filename            <chr> "E_locomotion_traking-20000228T224844.csv", "E_loc…
## $ Exp_block           <chr> "B_210421_1", "B_210421_1", "B_210421_1", "B_21042…
## $ Batch_ID            <chr> "B_210421_1", "B_210421_1", "B_210421_1", "B_21042…
## $ Sequence            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ temperature         <dbl> 24.78, 24.78, 24.78, 24.78, 24.78, 24.78, 24.78, 2…
## $ round               <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arena_distance      <dbl> 0.0, 12.5, 2.8, 1.4, 3.2, 8.6, 4.6, 1.4, 3.0, 6.0,…

Merge with sex meta-data

Below we load the sex information (referenced to our data via the Individuals_ID data from the sex registry file) and left-join it with the locomotion data.

sex <- read_csv(here("Data", "run_data", "position_ID_sex_pilot2.csv"))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Indviduals_ID = col_character(),
##   Wellplate_location = col_character(),
##   Sex = col_character()
## )
# Build the same composite key used in the locomotion data (sexing-batch ID +
# well position), keep only the key and Sex, then attach sex to each row.
# NB: "Indviduals_ID" is the actual (misspelled) column name in the CSV.
sex <- sex %>%
  mutate(Individuals_ID_well = paste(Indviduals_ID, Wellplate_location, sep = "_")) %>%
  select(Individuals_ID_well, Sex)

# join together
data_locomotion <- left_join(data_locomotion, sex, by = "Individuals_ID_well")

# round as a factor, for use as a categorical model term
data_locomotion$Round_f <- as.factor(data_locomotion$round)

Create final dataset

# Keep only the variables used downstream and set the categorical types.
# Round_f is already a factor (created right after the sex join), so the
# original's second as.factor(Round_f) was a no-op and is dropped.
data_locomotion <- data_locomotion %>%
  select(Individuals_ID_well, Individuals_ID, Datafile_ID, Date, Exp_block,
         Batch_ID, Sequence, arena_distance, Sex, Round_f) %>%
  mutate(Sex = as.factor(Sex))

head(data_locomotion)
## # A tibble: 6 × 10
##   Individuals_ID_w… Individuals_ID Datafile_ID Date  Exp_block Batch_ID Sequence
##   <chr>             <chr>          <chr>       <chr> <chr>     <chr>       <dbl>
## 1 ID_210421_2_A1    ID_210421_2    Z0107D3_B_… 2104… B_210421… B_21042…        1
## 2 ID_210421_2_A2    ID_210421_2    Z0107D3_B_… 2104… B_210421… B_21042…        1
## 3 ID_210421_2_A3    ID_210421_2    Z0107D3_B_… 2104… B_210421… B_21042…        1
## 4 ID_210421_2_A4    ID_210421_2    Z0107D3_B_… 2104… B_210421… B_21042…        1
## 5 ID_210421_2_A5    ID_210421_2    Z0107D3_B_… 2104… B_210421… B_21042…        1
## 6 ID_210421_2_A6    ID_210421_2    Z0107D3_B_… 2104… B_210421… B_21042…        1
## # … with 3 more variables: arena_distance <dbl>, Sex <fct>, Round_f <fct>

Example visualisations

Histograms present raw and log-transformed total distance data. Scatter-whisker plot shows ranges of distance data (min to max distance traveled in total) with outliers marked by points.

# Per-individual mean distance. Sequence is constant within an individual, so
# carry it along with first(); `Sequence = Sequence` in the original produced
# the "grouped output" warning. .groups = "drop" returns an ungrouped tibble.
data_locomotion_avg <- data_locomotion %>%
  group_by(Individuals_ID_well) %>%
  summarise(avg = mean(arena_distance), Sequence = first(Sequence),
            .groups = "drop")
## `summarise()` has grouped output by 'Individuals_ID_well'. You can override
## using the `.groups` argument.
# raw data variation
# Histogram of per-individual mean distance (raw scale)
plot1.1 <- ggplot(data_locomotion_avg, aes(x = avg)) +
  geom_histogram(fill = col_accent1) +
  theme_classic() +
  theme(text = element_text(size = 15)) +
  xlab("Mean arena distance")
plot1.1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# log scale
# Histogram of per-individual mean distance (log scale)
plot1.2 <- ggplot(data_locomotion_avg, aes(x = log(avg + 1))) +
  geom_histogram(fill = col_accent1) +
  theme_classic() +
  theme(text = element_text(size = 15)) +
  xlab("log(Mean arena distance)") +
  ylab("Count")
plot1.2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Per-individual whisker ranges of log distance; outliers drawn as points
plot1.3 <- ggplot(data_locomotion,
                  aes(x = Individuals_ID_well, y = log(arena_distance + 1))) +
  geom_boxplot(width = 0, outlier.size = 0.7, col = col_accent1) +
  theme_classic() +
  theme(axis.text.x = element_blank()) +
  ylab("Arena distance") +
  xlab("Individuals")
plot1.3

# Distance by assay sequence. Stored as plot1.5: the original reused the name
# plot1.4 here, and the later sex-violin plot silently overwrote this object.
plot1.5 <- data_locomotion %>%
  ggplot(aes(x = as.factor(Sequence), y = log(arena_distance+1))) +
  geom_boxplot(width = 2, outlier.size = 0.7, col = col_accent1) +
  theme_classic() +
  ylab("Arena distance") + xlab("Sequence")
plot1.5

We can also visualise sexual differences.

# Violin plot of per-individual log mean distance grouped by sex, with
# jittered points and mean +/- SE crossbars. The redundant library(ggpubr)
# call is dropped - ggpubr is already attached at the top of the script.
plot1.4 <- na.omit(data_locomotion) %>%
  group_by(Individuals_ID_well) %>%
  summarise(arena_distance = log(mean(arena_distance) + 1), Sex = unique(Sex)) %>%
  ggviolin(x = "Sex", y = "arena_distance", add = c("jitter", "mean_se"),
           error.plot = "crossbar", fill = col_accent1, lwd = 0) +
  labs(y = "log(Mean arena distance)") +
  theme(text = element_text(size = 15))
plot1.4

Produce and save final plots.

# ggsave(plot1.3, filename = "./R/visualisations/FigS1.pdf", device = "pdf", scale = 0.8)
# plot_loco <- ggarrange(plot1.2, plot1.4)
# plot_loco
# ggsave(plot_loco, filename = "./R/visualisations/Fig1.pdf", device = "pdf", scale = 0.8)

Example data analysis

Simple linear model to explore underlying variation.

Below the models are fitted using the lme function (allowing for between-sex heteroscedasticity in residuals).

# model 1: variance between rounds, individuals and batches, fixed effect of sex and assay date
# Random intercepts: Exp_block, with Individuals_ID_well nested inside it
# (nlme interprets a list of one-sided formulas as nested grouping levels -
# see "Individuals_ID_well %in% Exp_block" in the output below).
# Response is standardised log distance; na.action = na.omit drops rows
# with missing values (e.g. individuals without sex information).
model1.lme <- lme(scale(log(arena_distance+1)) ~ Sex + as.factor(Date)+as.factor(Sequence),
               random = list(~1|Exp_block, ~1|Individuals_ID_well),
             data = data_locomotion, na.action = na.omit)
summary(model1.lme)
## Linear mixed-effects model fit by REML
##   Data: data_locomotion 
##        AIC      BIC    logLik
##   4265.173 4303.548 -2125.587
## 
## Random effects:
##  Formula: ~1 | Exp_block
##         (Intercept)
## StdDev:   0.4235481
## 
##  Formula: ~1 | Individuals_ID_well %in% Exp_block
##         (Intercept)  Residual
## StdDev:   0.5979514 0.6782933
## 
## Fixed effects:  scale(log(arena_distance + 1)) ~ Sex + as.factor(Date) + as.factor(Sequence) 
##                            Value Std.Error   DF   t-value p-value
## (Intercept)           -0.3935904 0.3071344 1424 -1.281492  0.2002
## SexM                   0.4683702 0.0730066  350  6.415454  0.0000
## as.factor(Date)220421  0.3961888 0.4308572    2  0.919536  0.4549
## as.factor(Sequence)2   0.0466292 0.1429587  350  0.326173  0.7445
##  Correlation: 
##                       (Intr) SexM   a.(D)2
## SexM                  -0.098              
## as.factor(Date)220421 -0.706  0.004       
## as.factor(Sequence)2  -0.110 -0.022  0.080
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.15827467 -0.54295102 -0.02761993  0.52271810  5.87105186 
## 
## Number of Observations: 1780
## Number of Groups: 
##                          Exp_block Individuals_ID_well %in% Exp_block 
##                                  4                                356
# model 2: additional variation in sex effect between individuals (i.e. sex-specific between individual variance)
# ~Sex-1 at the individual level estimates a separate between-individual SD
# for each sex (no common intercept), instead of the single SD of model 1;
# compared against model 1 via anova() below.
model2.lme <- lme(scale(log(arena_distance+1)) ~ Sex + as.factor(Date)+as.factor(Sequence),
               random = list(~1|Exp_block, ~Sex-1|Individuals_ID_well),
             data = data_locomotion, na.action = na.omit)
summary(model2.lme)
## Linear mixed-effects model fit by REML
##   Data: data_locomotion 
##        AIC      BIC    logLik
##   4252.747 4302.086 -2117.373
## 
## Random effects:
##  Formula: ~1 | Exp_block
##         (Intercept)
## StdDev:   0.3896704
## 
##  Formula: ~Sex - 1 | Individuals_ID_well %in% Exp_block
##  Structure: General positive-definite, Log-Cholesky parametrization
##          StdDev    Corr
## SexF     0.4895261 SexF
## SexM     0.7317608 0   
## Residual 0.6782941     
## 
## Fixed effects:  scale(log(arena_distance + 1)) ~ Sex + as.factor(Date) + as.factor(Sequence) 
##                            Value Std.Error   DF   t-value p-value
## (Intercept)           -0.3474452 0.2823834 1424 -1.230402  0.2187
## SexM                   0.4714090 0.0774704  350  6.085022  0.0000
## as.factor(Date)220421  0.3204151 0.3968428    2  0.807411  0.5042
## as.factor(Sequence)2  -0.0308337 0.1346660  350 -0.228964  0.8190
##  Correlation: 
##                       (Intr) SexM   a.(D)2
## SexM                  -0.075              
## as.factor(Date)220421 -0.708  0.004       
## as.factor(Sequence)2  -0.112 -0.019  0.081
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.04560235 -0.55073622 -0.03794525  0.53694189  5.86629912 
## 
## Number of Observations: 1780
## Number of Groups: 
##                          Exp_block Individuals_ID_well %in% Exp_block 
##                                  4                                356
anova(model1.lme, model2.lme)
##            Model df      AIC      BIC    logLik   Test  L.Ratio p-value
## model1.lme     1  7 4265.173 4303.548 -2125.587                        
## model2.lme     2  9 4252.747 4302.086 -2117.373 1 vs 2 16.42666   3e-04
## model 3: testing of sex-specific heteroscedasticity of residual variance
# Same as model 2, but weights = varIdent(form = ~1|Sex) additionally lets the
# RESIDUAL standard deviation differ between the sexes (stratum-specific
# variance function), on top of the sex-specific between-individual SDs.
model2.lme.h <- lme(scale(log(arena_distance+1)) ~ Sex + as.factor(Date)+as.factor(Sequence),
               random = list(~1|Exp_block, ~Sex-1|Individuals_ID_well),
               weights = varIdent(form=~1|Sex),
             data = data_locomotion, na.action = na.omit)
summary(model2.lme.h)
## Linear mixed-effects model fit by REML
##   Data: data_locomotion 
##        AIC      BIC    logLik
##   4220.467 4275.288 -2100.233
## 
## Random effects:
##  Formula: ~1 | Exp_block
##         (Intercept)
## StdDev:   0.3896384
## 
##  Formula: ~Sex - 1 | Individuals_ID_well %in% Exp_block
##  Structure: General positive-definite, Log-Cholesky parametrization
##          StdDev    Corr
## SexF     0.5065089 SexF
## SexM     0.7145517 0   
## Residual 0.6127982     
## 
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | Sex 
##  Parameter estimates:
##        F        M 
## 1.000000 1.247676 
## Fixed effects:  scale(log(arena_distance + 1)) ~ Sex + as.factor(Date) + as.factor(Sequence) 
##                            Value Std.Error   DF   t-value p-value
## (Intercept)           -0.3474449 0.2823614 1424 -1.230497  0.2187
## SexM                   0.4714093 0.0774709  350  6.084988  0.0000
## as.factor(Date)220421  0.3204146 0.3968115    2  0.807473  0.5042
## as.factor(Sequence)2  -0.0308344 0.1346665  350 -0.228969  0.8190
##  Correlation: 
##                       (Intr) SexM   a.(D)2
## SexM                  -0.075              
## as.factor(Date)220421 -0.708  0.004       
## as.factor(Sequence)2  -0.112 -0.019  0.081
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.46316987 -0.57588782 -0.02410252  0.53466987  5.44967210 
## 
## Number of Observations: 1780
## Number of Groups: 
##                          Exp_block Individuals_ID_well %in% Exp_block 
##                                  4                                356
anova(model2.lme, model2.lme.h)
##              Model df      AIC      BIC    logLik   Test  L.Ratio p-value
## model2.lme       1  9 4252.747 4302.086 -2117.373                        
## model2.lme.h     2 10 4220.467 4275.288 -2100.233 1 vs 2 34.27962  <.0001

A simplified version of this model (ignoring residual heteroscedasticity and only looking at sex-specific between-individual variance) can be fitted by lmer (the lme4 package).

# model: variance between rounds, individuals and batches, fixed effect of sex and assay date
# lmer equivalents: model1 has random intercepts for individual and block;
# model1.1 drops the individual intercept so anova() below can test its
# contribution. Note: unlike the lme models above, Sequence is not included
# as a fixed effect here - presumably deliberate simplification; confirm.
model1 <- lmer(scale(log(arena_distance+1)) ~ Sex + as.factor(Date) + (1|Individuals_ID_well) + (1|Exp_block),
             data = data_locomotion)
model1.1 <- lmer(scale(log(arena_distance+1)) ~ Sex + as.factor(Date) + (1|Exp_block),
             data = data_locomotion)
summary(model1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: scale(log(arena_distance + 1)) ~ Sex + as.factor(Date) + (1 |  
##     Individuals_ID_well) + (1 | Exp_block)
##    Data: data_locomotion
## 
## REML criterion at convergence: 4230.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1574 -0.5443 -0.0276  0.5230  5.8709 
## 
## Random effects:
##  Groups              Name        Variance Std.Dev.
##  Individuals_ID_well (Intercept) 0.3526   0.5938  
##  Exp_block           (Intercept) 0.1748   0.4181  
##  Residual                        0.4552   0.6747  
## Number of obs: 1780, groups:  Individuals_ID_well, 356; Exp_block, 4
## 
## Fixed effects:
##                        Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)            -0.36786    0.30142   2.04236  -1.220    0.344    
## SexM                    0.46640    0.07251 351.28318   6.432 4.11e-10 ***
## as.factor(Date)220421   0.38289    0.42404   1.99989   0.903    0.462    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) SexM  
## SexM        -0.102       
## a.(D)220421 -0.704  0.006
anova(model1, model1.1)
## refitting model(s) with ML (instead of REML)
## Data: data_locomotion
## Models:
## model1.1: scale(log(arena_distance + 1)) ~ Sex + as.factor(Date) + (1 | 
## model1.1:     Exp_block)
## model1: scale(log(arena_distance + 1)) ~ Sex + as.factor(Date) + (1 | 
## model1:     Individuals_ID_well) + (1 | Exp_block)
##          npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## model1.1    5 4687.3 4714.8 -2338.7   4677.3                         
## model1      6 4236.8 4269.7 -2112.4   4224.8 452.53  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qplot(residuals(model1)) + theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# model: additional variation in sex effect between individuals (i.e. sex-specific between individual variance)
# (Sex-1|Individuals_ID_well) suppresses the common random intercept so a
# separate between-individual variance is estimated for each sex level
model2 <- lmer(scale(log(arena_distance+1)) ~ Sex + as.factor(Date) + (Sex-1|Individuals_ID_well) + (1|Exp_block),
             data = data_locomotion)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00832546 (tol = 0.002, component 1)
# marginally off convergence but let's proceed
summary(model2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: scale(log(arena_distance + 1)) ~ Sex + as.factor(Date) + (Sex -  
##     1 | Individuals_ID_well) + (1 | Exp_block)
##    Data: data_locomotion
## 
## REML criterion at convergence: 4213.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0447 -0.5517 -0.0391  0.5384  5.8664 
## 
## Random effects:
##  Groups              Name        Variance Std.Dev. Corr
##  Individuals_ID_well SexF        0.2365   0.4863       
##                      SexM        0.5273   0.7261   0.33
##  Exp_block           (Intercept) 0.1500   0.3872       
##  Residual                        0.4552   0.6747       
## Number of obs: 1780, groups:  Individuals_ID_well, 356; Exp_block, 4
## 
## Fixed effects:
##                        Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)            -0.34022    0.27885   2.02512  -1.220    0.345    
## SexM                    0.46854    0.07691 240.50191   6.092 4.39e-09 ***
## as.factor(Date)220421   0.32617    0.39307   1.99915   0.830    0.494    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) SexM  
## SexM        -0.078       
## a.(D)220421 -0.706  0.006
## optimizer (nloptwrap) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.00832546 (tol = 0.002, component 1)
summary(model2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: scale(log(arena_distance + 1)) ~ Sex + as.factor(Date) + (Sex -  
##     1 | Individuals_ID_well) + (1 | Exp_block)
##    Data: data_locomotion
## 
## REML criterion at convergence: 4213.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0447 -0.5517 -0.0391  0.5384  5.8664 
## 
## Random effects:
##  Groups              Name        Variance Std.Dev. Corr
##  Individuals_ID_well SexF        0.2365   0.4863       
##                      SexM        0.5273   0.7261   0.33
##  Exp_block           (Intercept) 0.1500   0.3872       
##  Residual                        0.4552   0.6747       
## Number of obs: 1780, groups:  Individuals_ID_well, 356; Exp_block, 4
## 
## Fixed effects:
##                        Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)            -0.34022    0.27885   2.02512  -1.220    0.345    
## SexM                    0.46854    0.07691 240.50191   6.092 4.39e-09 ***
## as.factor(Date)220421   0.32617    0.39307   1.99915   0.830    0.494    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) SexM  
## SexM        -0.078       
## a.(D)220421 -0.706  0.006
## optimizer (nloptwrap) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.00832546 (tol = 0.002, component 1)
# males tend to have larger variance

# likelihood-ratio test: does allowing sex-specific between-individual
# variance (model2) improve the fit over the common-variance model1?
anova(model1, model2) %>% kbl() %>% kable_material(c("striped"))
## refitting model(s) with ML (instead of REML)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
model1 6 4236.800 4269.707 -2112.400 4224.800 NA NA NA
model2 8 4224.016 4267.891 -2104.008 4208.016 16.7849 2 0.0002266
# difference in sex-specific variances significant (likelihood-ratio test)

Y-maze

Data parsing

First we load the y-maze data and tidy it up to separate summary data from zone changes data.

## parse datafiles into a nested tibble
# list all raw Y-maze CSV exports under Data/ymaze/ (searched recursively)
data_path <- here('Data/ymaze/')
data_files <- list.files(here('Data/ymaze/'), recursive = T)
# peek at the first 8 file names
data_files[1:8]
## [1] "E_ymaze_droso-20210421T232255.csv" "E_ymaze_droso-20210421T232340.csv"
## [3] "E_ymaze_droso-20210421T232451.csv" "E_ymaze_droso-20210422T005300.csv"
## [5] "E_ymaze_droso-20210422T005336.csv" "E_ymaze_droso-20210422T005452.csv"
## [7] "E_ymaze_droso-20210422T034214.csv" "E_ymaze_droso-20210422T034234.csv"

The below parser identifies lines in the dataset that directly relate to zone-switching data and extracts them, or (when summary = TRUE) it extracts the arena distances summaries from the bottom section of the file.

# helper functions extracting the data rows and the header rows
#
# data_parser: extract either the zone-change rows or the summary table
# from one raw Zantiks output file.
#   pattern  - regular expression identifying data lines (here: "Arena")
#   path     - directory holding the raw CSV files
#   filename - file to parse
#   summary  - FALSE: parse the lines matching `pattern` (zone-change events,
#              no header); TRUE: parse the remaining lines, skipping 6 and
#              reading the 3 summary rows from the bottom section
data_parser <- function(pattern, path, filename, summary = FALSE) {
  # file.path() joins path components portably; the original
  # paste(path, filename, sep = '') only worked when `path` already carried
  # a trailing slash (see the Mac/PC workaround in the locomotion section)
  dat_temp <- readLines(file.path(path, filename))
  zone_change_index <- grep(pattern, dat_temp)

  if (summary == FALSE) {
    # lines matching the pattern hold the zone-change events
    return(read_delim(file = I(dat_temp[zone_change_index]), col_names = F, delim = ',',
                      quote = '\"'))
  } else {
    # the remaining lines hold run metadata; the summary table sits
    # 6 lines in and spans 3 rows
    return(read_delim(file = I(dat_temp[-zone_change_index]), col_names = T, delim = ',',
                      quote = '\"', skip = 6, n_max = 3))
  }
}

# test the parser
# here we simply check that the selected pattern to be looked for (`Arena`) is indeed present in the file.
pattern1 <- "Arena" # defines what should contain each line that we look for
test1 <- readLines(paste(data_path, data_files[1], sep = ''))
id_test <- grep(pattern1, test1) # use (simplified) RE to select lines
# this extracts (messy) run and temporary data from each file
# (same call as the summary = TRUE branch of data_parser above)
test_dat <- read_delim(file = I(test1[-id_test]), col_names = T, delim = ',',
                      quote = '\"', skip = 6, n_max = 3)
## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]
# helper function that parses the (messy) run data into a readable format)
# Reads only the first 4 lines (run metadata) of a raw output file and
# parses them as comma-delimited fields without a header.
head_parser <- function(path, filename) {
  # file.path() is OS-portable; paste(sep = '') relied on `path` ending in '/'
  dat_temp <- readLines(file.path(path, filename), n = 4)
  return(read_delim(file = I(dat_temp),
                    delim = ',',
                    col_names = F))
}

Below steps go through initial cleaning of data and renaming variables, as well as detailed parsing of raw Y-maze movement data.

The below code generates a nested tibble (grouped by single analysis output files, i.e., Filename).

# create tibble with raw data (nested: data and header nested under file names)
# `data` holds the parsed zone-change rows, `head` the parsed run metadata
dat_raw <- tibble(data_files = data_files) %>%
  mutate(data = map(data_files, 
                    ~ data_parser(pattern1,
                                  data_path,
                                  .x, summary = FALSE))) %>%
  mutate(head = map(data_files, ~ head_parser(data_path, .x)))

# this removes the redundant 'Arena' column
# NOTE(review): dat_raw[[2]] / dat_raw[[3]] address the `data` and `head`
# list-columns by position - fragile if columns are ever reordered
dat_raw[[2]] <- dat_raw[[2]] %>%
  map(~ select(.x, !c(X3)))

# rename variables in sub-tibble based on their real content
dat_raw[[2]] <- dat_raw[[2]] %>%
  map(~ rename(.x, time = X1, Info = X2, Arena = X4, Action = X5, Zone_no = X6))

# reformat the head sub-tibble
# (one wide row per file; keep only the apparatus and subject identifier)
dat_raw[[3]] <- dat_raw[[3]] %>%
  map(~ pivot_wider(.x, names_from = X3, values_from = X4)) %>%
  map(~ rename(.x, Datafile_ID = `Subject Identification`)) %>%
  map(~ select(.x, Apparatus, Datafile_ID))

# check the modifications
dat_raw[[2]][[1]]
## # A tibble: 6,119 × 5
##     time Info  Arena Action     Zone_no
##    <dbl> <chr> <dbl> <chr>        <dbl>
##  1  601. Info      1 Enter_Zone       3
##  2  601. Info      3 Enter_Zone       1
##  3  601. Info      4 Enter_Zone       1
##  4  601. Info      6 Enter_Zone       4
##  5  601. Info      7 Enter_Zone       4
##  6  601. Info      9 Enter_Zone       4
##  7  601. Info     10 Enter_Zone       1
##  8  601. Info     11 Enter_Zone       3
##  9  601. Info     12 Enter_Zone       1
## 10  601. Info     14 Enter_Zone       3
## # … with 6,109 more rows
dat_raw[[3]][[1]]
## # A tibble: 1 × 2
##   Apparatus        Datafile_ID       
##   <chr>            <chr>             
## 1 Zantiks MWP Unit Z5B0E17_B_210421_1

The below code processes arena movement summaries contained in the bottom sections of output files; this data is not further used in the current protocol.

# parse summary data from a file into a separate tibble - this is not used for now
# processed only for consistency
# (the "Missing column names filled in" warnings come from the header-less
# summary rows; they are expected and harmless here)
dat_raw_summ <- tibble(data_files = data_files) %>%
  mutate(data = map(data_files, ~ data_parser(pattern1,
                                data_path,
                                .x,
                                summary = T)))
## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]

## Warning: Missing column names filled in: 'X1' [1], 'X2' [2], 'X3' [3], 'X4' [4]
# dat_raw_summ[[2]] <- dat_raw_summ[[2]] %>%
#   map(~ select(.x, !c(X1, X2, X3))) %>%
#   map(~ rename(.x, Summary_stat = X4))
# dat_raw_summ[[2]][[1]]

Unnest the tibble and select relevant variables.

# Flatten the nested tibble - expand the header metadata first and the
# zone-change data second - then keep only the analysis-relevant columns
# and give the file-name column a clearer name.
dat_un <- dat_raw %>%
  unnest(head) %>%
  unnest(data) %>%
  select(data_files, Datafile_ID, time, Arena, Action, Zone_no) %>%
  rename(Data_file = data_files)

Turn into wider format with enter/exit columns, add row ID to maintain order if needed.

# Spread the enter/exit events into separate columns; Row_ID records the
# original event order so it can be recovered after re-sorting
dat_un <- dat_un %>%
  mutate(Row_ID = row_number()) %>%
  pivot_wider(names_from = Action, values_from = Zone_no) %>%
  rename(Exit_zone = Exit_Zone, Enter_zone = Enter_Zone)
dat_un
## # A tibble: 96,083 × 7
##    Data_file                 Datafile_ID  time Arena Row_ID Enter_zone Exit_zone
##    <chr>                     <chr>       <dbl> <dbl>  <int>      <dbl>     <dbl>
##  1 E_ymaze_droso-20210421T2… Z5B0E17_B_…  601.     1      1          3        NA
##  2 E_ymaze_droso-20210421T2… Z5B0E17_B_…  601.     3      2          1        NA
##  3 E_ymaze_droso-20210421T2… Z5B0E17_B_…  601.     4      3          1        NA
##  4 E_ymaze_droso-20210421T2… Z5B0E17_B_…  601.     6      4          4        NA
##  5 E_ymaze_droso-20210421T2… Z5B0E17_B_…  601.     7      5          4        NA
##  6 E_ymaze_droso-20210421T2… Z5B0E17_B_…  601.     9      6          4        NA
##  7 E_ymaze_droso-20210421T2… Z5B0E17_B_…  601.    10      7          1        NA
##  8 E_ymaze_droso-20210421T2… Z5B0E17_B_…  601.    11      8          3        NA
##  9 E_ymaze_droso-20210421T2… Z5B0E17_B_…  601.    12      9          1        NA
## 10 E_ymaze_droso-20210421T2… Z5B0E17_B_…  601.    14     10          3        NA
## # … with 96,073 more rows

Bin into 10-min intervals (if binning would be needed or only first 10 mins of exploration used). Then sort the dataset into individual arenas and chronologically within arenas, and then in enter-exit order at the lowest level.

dat_an <- dat_un
# Assign 10-min time bins.
# FIX(review): the original nested ifelse() used strict inequalities on both
# sides of every interval, so events at exactly t = 1200 s or t = 1800 s
# fell into no bin (NA) and were later silently dropped by na.omit(). The
# half-open intervals below close those gaps. Times <= 600 or >= 2500
# remain NA (outside the assay window), as before.
dat_an <- dat_an %>%
  mutate(Bin = case_when(
    time >  600  & time < 1200 ~ 1,
    time >= 1200 & time < 1800 ~ 2,
    time >= 1800 & time < 2500 ~ 3,
    TRUE ~ NA_real_
  ))

# sort chronologically within each arena; Exit_zone as the final key puts
# Exit events (non-NA) before the following Enter events (NA sorts last)
# when they share a timestamp
dat_an <- dat_an %>%
  arrange(Data_file, Datafile_ID, Arena, time, Exit_zone)
dat_an
## # A tibble: 96,083 × 8
##    Data_file           Datafile_ID  time Arena Row_ID Enter_zone Exit_zone   Bin
##    <chr>               <chr>       <dbl> <dbl>  <int>      <dbl>     <dbl> <dbl>
##  1 E_ymaze_droso-2021… Z5B0E17_B_…  601.     1      1          3        NA     1
##  2 E_ymaze_droso-2021… Z5B0E17_B_…  887.     1    154         NA         3     1
##  3 E_ymaze_droso-2021… Z5B0E17_B_…  887.     1    155          4        NA     1
##  4 E_ymaze_droso-2021… Z5B0E17_B_…  887.     1    156         NA         4     1
##  5 E_ymaze_droso-2021… Z5B0E17_B_…  887.     1    157          3        NA     1
##  6 E_ymaze_droso-2021… Z5B0E17_B_…  887.     1    158         NA         3     1
##  7 E_ymaze_droso-2021… Z5B0E17_B_…  887.     1    159          4        NA     1
##  8 E_ymaze_droso-2021… Z5B0E17_B_…  893.     1    162         NA         4     1
##  9 E_ymaze_droso-2021… Z5B0E17_B_…  893.     1    163          1        NA     1
## 10 E_ymaze_droso-2021… Z5B0E17_B_…  747.     2    119          4        NA     1
## # … with 96,073 more rows

Error checking - do all enter zones have a matching exit zone (if enter and exit exist)?

# Sanity check: after sorting, every Enter_zone row should be immediately
# followed by the matching Exit_zone row for the same zone. 666 is a
# sentinel flagging violations:
#  - Zone: enter zone differs from the next row's exit zone (on exit rows,
#    where Enter_zone is NA, ifelse() yields NA rather than 666)
#  - t_enter / t_exit: would flag zone codes below 1 (never expected)
dat_an <- dat_an %>%
  mutate(Zone = ifelse(Enter_zone == lead(Exit_zone), Enter_zone, 666)) %>%
  mutate(t_enter = ifelse(Enter_zone >= 1, time, 666)) %>%
  mutate(t_exit = ifelse(Exit_zone >= 1, time, 666))

# list the violations (inspected in the output below)
dat_an %>% filter(Zone == 666)
## # A tibble: 17 × 11
##    Data_file     Datafile_ID  time Arena Row_ID Enter_zone Exit_zone   Bin  Zone
##    <chr>         <chr>       <dbl> <dbl>  <int>      <dbl>     <dbl> <dbl> <dbl>
##  1 E_ymaze_dros… Z5B0E17_B_… 2393.    15  86154          2        NA     3   666
##  2 E_ymaze_dros… Z8E63AA_B_… 2092.     4  88502          3        NA     3   666
##  3 E_ymaze_dros… Z8E63AA_B_… 2401.     5  89288          2        NA     3   666
##  4 E_ymaze_dros… Z8E63AA_B_… 2236.     7  88984          1        NA     3   666
##  5 E_ymaze_dros… Z8E63AA_B_… 1647.     8  87462          2        NA     2   666
##  6 E_ymaze_dros… Z8E63AA_B_… 2167.     9  88636          1        NA     3   666
##  7 E_ymaze_dros… Z8E63AA_B_… 2337.    11  89168          1        NA     3   666
##  8 E_ymaze_dros… Z8E63AA_B_… 1423.    12  87108          3        NA     2   666
##  9 E_ymaze_dros… Z62B8E1_B_… 1791.     1  93319          4        NA     2   666
## 10 E_ymaze_dros… Z62B8E1_B_… 2376.     3  95907          3        NA     3   666
## 11 E_ymaze_dros… Z62B8E1_B_… 2034.     4  93829          4        NA     3   666
## 12 E_ymaze_dros… Z62B8E1_B_… 2402.     5  96083          3        NA     3   666
## 13 E_ymaze_dros… Z62B8E1_B_… 1882.     6  93507          2        NA     3   666
## 14 E_ymaze_dros… Z62B8E1_B_…  730.     7  89831          3        NA     1   666
## 15 E_ymaze_dros… Z62B8E1_B_… 2219.     8  95393          2        NA     3   666
## 16 E_ymaze_dros… Z62B8E1_B_… 2399.    10  96031          2        NA     3   666
## 17 E_ymaze_dros… Z62B8E1_B_… 2395.    12  96017          3        NA     3   666
## # … with 2 more variables: t_enter <dbl>, t_exit <dbl>
dat_an %>% filter(t_enter == 666)
## # A tibble: 0 × 11
## # … with 11 variables: Data_file <chr>, Datafile_ID <chr>, time <dbl>,
## #   Arena <dbl>, Row_ID <int>, Enter_zone <dbl>, Exit_zone <dbl>, Bin <dbl>,
## #   Zone <dbl>, t_enter <dbl>, t_exit <dbl>
dat_an %>% filter(t_exit == 666)
## # A tibble: 0 × 11
## # … with 11 variables: Data_file <chr>, Datafile_ID <chr>, time <dbl>,
## #   Arena <dbl>, Row_ID <int>, Enter_zone <dbl>, Exit_zone <dbl>, Bin <dbl>,
## #   Zone <dbl>, t_enter <dbl>, t_exit <dbl>
# ALL GOOD! Note - this step is very important and serves to test if data points were sorted correctly
# Single detected mistakes are likely final entries that failed to exit before assay end


# Let's filter out the non-conforming cases and re-confirm the rest is ordered correctly
# This stage is critical: dplyr filter() function is terribly unintuitive - with the logical
# condition "NOT" (!) it also drops NA values - thus we have to ensure with replace_na() they are kept
# to preserve the ordering of enter-exit events

# Drop the flagged sentinel rows while explicitly keeping NA rows (the
# exit-event half of each pair): filter() discards rows whose condition is
# NA, so the condition is written to evaluate TRUE whenever Zone is NA.
dat_an <- dat_an %>% filter(is.na(Zone) | Zone != 666)

# re-sort and recompute the sentinel check to confirm that the remaining
# enter/exit events now pair up cleanly
dat_an <- dat_an %>%
  arrange(Data_file, Datafile_ID, Arena, time, Exit_zone) %>%
  mutate(Zone    = ifelse(Enter_zone == lead(Exit_zone), Enter_zone, 666),
         t_enter = ifelse(Enter_zone >= 1, time, 666),
         t_exit  = ifelse(Exit_zone >= 1, time, 666))

# expect zero rows here
dat_an %>% filter(Zone == 666)
## # A tibble: 0 × 11
## # … with 11 variables: Data_file <chr>, Datafile_ID <chr>, time <dbl>,
## #   Arena <dbl>, Row_ID <int>, Enter_zone <dbl>, Exit_zone <dbl>, Bin <dbl>,
## #   Zone <dbl>, t_enter <dbl>, t_exit <dbl>

Simplify data by putting enter and exit data in one row.

# Collapse each enter/exit pair onto a single row.
dat_an <- dat_an %>%
  # filter(is.na(Zone)) %>% # this filter is just for testing purposes (it removes the 'Exit_zone' portion of data)
  select(Data_file, Datafile_ID, time, Arena, Row_ID, Bin, Zone, t_enter, t_exit)
# pull each exit time up onto the preceding enter row
dat_an <- dat_an %>%
  mutate(t_exit = lead(t_exit))
# drop the now-redundant exit rows (their Zone/t_enter are NA); this also
# removes rows with NA Bin, i.e. events outside the binned assay window
dat_an <- na.omit(dat_an)
# dwell time in the zone
dat_an <- dat_an %>%
  mutate(t_zone = t_exit - t_enter)
dat_an
## # A tibble: 47,860 × 10
##    Data_file   Datafile_ID  time Arena Row_ID   Bin  Zone t_enter t_exit  t_zone
##    <chr>       <chr>       <dbl> <dbl>  <int> <dbl> <dbl>   <dbl>  <dbl>   <dbl>
##  1 E_ymaze_dr… Z5B0E17_B_…  601.     1      1     1     3    601.   887. 2.86e+2
##  2 E_ymaze_dr… Z5B0E17_B_…  887.     1    155     1     4    887.   887. 3.40e-2
##  3 E_ymaze_dr… Z5B0E17_B_…  887.     1    157     1     3    887.   887. 4.33e-1
##  4 E_ymaze_dr… Z5B0E17_B_…  887.     1    159     1     4    887.   893. 5.37e+0
##  5 E_ymaze_dr… Z5B0E17_B_…  747.     2    119     1     4    747.   761. 1.39e+1
##  6 E_ymaze_dr… Z5B0E17_B_…  761.     2    121     1     2    761.   761. 1.00e-1
##  7 E_ymaze_dr… Z5B0E17_B_…  761.     2    123     1     4    761.   762. 8.01e-1
##  8 E_ymaze_dr… Z5B0E17_B_…  762.     2    125     1     2    762.   771. 9.00e+0
##  9 E_ymaze_dr… Z5B0E17_B_…  771.     2    127     1     4    771.   771. 3.33e-1
## 10 E_ymaze_dr… Z5B0E17_B_…  771.     2    129     1     2    771.   771. 3.98e-1
## # … with 47,850 more rows

Remove center zone (middle of the Y-maze). In an alternative version of analysis this can be included by considering the middle zone as equally valid spatial choice for flies. Below we simplify the analysis by considering only arm zones.

# zone 4 is the central junction of the Y-maze; keep only the arm zones (1-3)
dat_an2 <- dat_an %>% filter(Zone != 4)
dat_an2
## # A tibble: 24,280 × 10
##    Data_file   Datafile_ID  time Arena Row_ID   Bin  Zone t_enter t_exit  t_zone
##    <chr>       <chr>       <dbl> <dbl>  <int> <dbl> <dbl>   <dbl>  <dbl>   <dbl>
##  1 E_ymaze_dr… Z5B0E17_B_…  601.     1      1     1     3    601.   887. 286.   
##  2 E_ymaze_dr… Z5B0E17_B_…  887.     1    157     1     3    887.   887.   0.433
##  3 E_ymaze_dr… Z5B0E17_B_…  761.     2    121     1     2    761.   761.   0.100
##  4 E_ymaze_dr… Z5B0E17_B_…  762.     2    125     1     2    762.   771.   9.00 
##  5 E_ymaze_dr… Z5B0E17_B_…  771.     2    129     1     2    771.   771.   0.398
##  6 E_ymaze_dr… Z5B0E17_B_…  779.     2    133     1     3    779.   785.   6.13 
##  7 E_ymaze_dr… Z5B0E17_B_…  791.     2    137     1     3    791.   866.  74.6  
##  8 E_ymaze_dr… Z5B0E17_B_…  877.     2    149     1     3    877.   898.  20.8  
##  9 E_ymaze_dr… Z5B0E17_B_…  908.     2    167     1     2    908.   908.   0.100
## 10 E_ymaze_dr… Z5B0E17_B_…  908.     2    171     1     2    908.   909.   0.301
## # … with 24,270 more rows

Y-maze movements analysis

We need to work separately for each individual - the easiest is to separate individuals into their own sub-tibbles and work on them by vectorising operations - to achieve this we have to identify each individual (by pasting together file name and arena).

# lookup table mapping Y-maze arenas to 48-well plate positions
well_arena <- read_delim(here('Data', 'run_data', 'well_arena_corresp.csv'), delim = ';')
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Ymaze = col_character(),
##   Arena = col_double(),
##   Ymaze_arena = col_character(),
##   Plate48well = col_character()
## )
# Attach run metadata and build a unique per-fly identifier
# (Individuals_ID + 48-well plate position).
# NOTE(review): `run_register` is created in an earlier section of this file.
dat_an2 <- dat_an2 %>%
  left_join(select(run_register, Datafile_ID, Individuals_ID, Sequence, Plate, Unit_ID), by = "Datafile_ID") %>%
  mutate(Ymaze_arena = paste0(Plate, "_", Arena)) %>%
  # mutate(fly_id = paste(gsub('[A-Za-z0-9]+\\/([A-Za-z0-9_-]+)\\.csv', '\\1', data_files), arena, sep = "_"))
  # mutate(fly_id = paste0(Datafile_ID, "_", Ymaze_arena)) %>% # not really necessary
  left_join(select(well_arena, Ymaze_arena, Plate48well), by = "Ymaze_arena") %>%
  mutate(Individuals_ID_well = paste0(Individuals_ID, "_", Plate48well)) %>%
  select(Datafile_ID, Individuals_ID, Individuals_ID_well, Sequence, Ymaze_arena, time, Bin, Zone, t_zone)

# split datafile into individual flies
# (a named list of sub-tibbles, one per Individuals_ID_well)
dat_an2 <- dat_an2 %>% split(., .[, "Individuals_ID_well"])

# Classify each zone change as a left (L) or right (R) turn relative to the
# previously occupied arm zone. The mapping below encodes the assumed arm
# numbering of the maze (1-3, after removal of the center zone 4) -
# confirm against the apparatus layout. A fly's first entry (Lag_zone NA)
# and re-entries into the same zone fall through to Turn = NA.
dat_an2 <- dat_an2 %>%
  map(~ mutate(.x, Lag_zone = lag(Zone))) %>%
  map(~ mutate(.x, Turn = case_when(Lag_zone==1 & Zone==2 ~ 'L',
                                  Lag_zone==1 & Zone==3 ~ 'R',
                                  Lag_zone==2 & Zone==1 ~ 'R',
                                  Lag_zone==2 & Zone==3 ~ 'L',
                                  Lag_zone==3 & Zone==1 ~ 'L',
                                  Lag_zone==3 & Zone==2 ~ 'R',
                                  # lag_zone==zone ~ 'X', # this enables additional decision type = stay in the given zone
                                  TRUE ~ NA_character_ ))) %>%
  map(~ select(.x, Datafile_ID, Individuals_ID, Individuals_ID_well, Sequence, Ymaze_arena, time, Bin, Zone, t_zone, Turn))

# recombine the per-fly sub-tibbles and order by file, fly and time bin
dat_an3 <- bind_rows(dat_an2)
dat_an3 <- dat_an3 %>%
  arrange(Datafile_ID, Individuals_ID_well, Bin)

# drop rows without a classified turn (first entries, same-zone re-entries)
dat_an4 <- na.omit(dat_an3)
dat_an4
## # A tibble: 2,745 × 10
##    Datafile_ID  Individuals_ID Individuals_ID_… Sequence Ymaze_arena  time   Bin
##    <chr>        <chr>          <chr>               <dbl> <chr>       <dbl> <dbl>
##  1 Z5B0E17_B_2… ID_210421_1    ID_210421_1_A2          1 YMAZE_plat…  779.     1
##  2 Z5B0E17_B_2… ID_210421_1    ID_210421_1_A2          1 YMAZE_plat…  908.     1
##  3 Z5B0E17_B_2… ID_210421_1    ID_210421_1_A2          1 YMAZE_plat… 1121.     1
##  4 Z5B0E17_B_2… ID_210421_1    ID_210421_1_A2          1 YMAZE_plat… 1246.     2
##  5 Z5B0E17_B_2… ID_210421_1    ID_210421_1_A2          1 YMAZE_plat… 1338.     2
##  6 Z5B0E17_B_2… ID_210421_1    ID_210421_1_A2          1 YMAZE_plat… 1418.     2
##  7 Z5B0E17_B_2… ID_210421_1    ID_210421_1_A2          1 YMAZE_plat… 1432.     2
##  8 Z5B0E17_B_2… ID_210421_1    ID_210421_1_A2          1 YMAZE_plat… 1789.     2
##  9 Z5B0E17_B_2… ID_210421_1    ID_210421_1_A2          1 YMAZE_plat… 1818.     3
## 10 Z5B0E17_B_2… ID_210421_1    ID_210421_1_A2          1 YMAZE_plat… 1976.     3
## # … with 2,735 more rows, and 3 more variables: Zone <dbl>, t_zone <dbl>,
## #   Turn <chr>

The resulting data could be further refined, e.g. by removing entries with t_zone below a certain threshold (i.e., ignoring cases where a fly only very briefly entered a zone and exited right away).

There are several paradigms for analysing Y-maze data. One is based on tetragram analysis (i.e., sequences of 4 consecutive turns in the maze). Here we simplify it to trigram analysis to increase the power and the effective number of consecutive movement sequences.

# group by fly so that lead() below never crosses individuals
dat_tri <- dat_an4 %>%
  group_by(Individuals_ID_well)

# Trigram = the current turn plus the next two turns of the same fly.
# NOTE(review): grouping is by individual only, so a trigram can span the
# boundary between two Sequence rounds of the same fly - confirm intended.
dat_tri <- dat_tri %>%
  mutate(Trigram = str_c(Turn, lead(Turn), lead(Turn,2))) %>%
  ungroup() %>%
  select(Individuals_ID_well,Sequence, Turn, Trigram)
dat_tri
## # A tibble: 2,745 × 4
##    Individuals_ID_well Sequence Turn  Trigram
##    <chr>                  <dbl> <chr> <chr>  
##  1 ID_210421_1_A2             1 L     LRL    
##  2 ID_210421_1_A2             1 R     RLL    
##  3 ID_210421_1_A2             1 L     LLR    
##  4 ID_210421_1_A2             1 L     LRL    
##  5 ID_210421_1_A2             1 R     RLL    
##  6 ID_210421_1_A2             1 L     LLL    
##  7 ID_210421_1_A2             1 L     LLR    
##  8 ID_210421_1_A2             1 L     LRR    
##  9 ID_210421_1_A2             1 R     RRR    
## 10 ID_210421_1_A2             1 R     RRL    
## # … with 2,735 more rows

We can now summarise trigrams and turns:

# all trigram patterns observed in the data
# NOTE(review): not referenced in the code visible below - kept for reference
all_trigrams <- unique(dat_tri$Trigram)

# Count trigram occurrences per fly and sequence.
# NOTE: table() ignores the dplyr grouping - it cross-tabulates all
# remaining columns (Individuals_ID_well x Sequence x Trigram), which
# conveniently yields explicit zero counts for unseen combinations.
tri_long <- dat_tri %>%
  select(-Turn) %>%
  na.omit() %>%
  group_by(Individuals_ID_well) %>%
  table() %>%
  as_tibble() %>%
  arrange(Individuals_ID_well)

# one row per fly x sequence, one column per trigram
tri_wide <- tri_long %>% pivot_wider(names_from = Trigram, values_from = n)

# Count L/R turns per individual. Sequence is dropped here, so turn counts
# pool both assay rounds of a fly (unlike the per-sequence trigram counts).
# As above, table() cross-tabulates the remaining columns
# (Individuals_ID_well x Turn) with explicit zeros.
# FIX: `=` replaced with the conventional `<-` assignment operator.
turn_long <- dat_tri %>%
  select(-Trigram, -Sequence) %>%
  na.omit() %>%
  group_by(Individuals_ID_well) %>%
  table() %>%
  as_tibble() %>%
  arrange(Individuals_ID_well)

# one row per fly, one column per turn direction
turn_wide <- turn_long %>% pivot_wider(names_from = Turn, values_from = n)

# Per-individual summary of Y-maze behaviour.
# NOTE(review): tri_wide has one row per fly x Sequence while turn_wide has
# one row per fly, so L/R/total_turns pool both rounds whereas trigram
# counts are per round - confirm the rel_* denominators are intended to
# pool both sequences.
data_ymaze <- tri_wide %>%
  left_join(turn_wide, by = 'Individuals_ID_well') %>%
  mutate(total_turns = L+R,
         reps = LLL + RRR,               # repetitions: three same-direction turns
         alter = RLR + LRL,              # alternations: strict direction switching
         partial = RRL + RLL + LRR + LLR,
         rel_reps = (reps*100)/total_turns,
         rel_alter = (alter*100)/total_turns,
         rel_R = (R*100)/total_turns,
         rel_L = (L*100)/total_turns,
         # FIX: guard the division - flies with L == 0 previously produced
         # Inf/NaN asymmetry; NA is handled as missing by downstream plots
         asymmetry = ifelse(L > 0, 1 - (R/L), NA_real_))
         # asymmetry: 0 = symmetrical, > 0 = left-turn bias (R < L),
         # < 0 = right-turn bias (R > L)

data_ymaze
## # A tibble: 190 × 21
##    Individuals_ID_well Sequence   LLL   LLR   LRL   LRR   RLL   RLR   RRL   RRR
##    <chr>               <chr>    <int> <int> <int> <int> <int> <int> <int> <int>
##  1 ID_210421_1_A2      1            1     2     2     1     2     0     1     1
##  2 ID_210421_1_A2      2            0     0     0     0     0     0     0     0
##  3 ID_210421_1_B6      1            0     0     0     1     0     0     1     0
##  4 ID_210421_1_B6      2            0     0     0     0     0     0     0     0
##  5 ID_210421_1_D8      1            0     0     0     0     0     0     0     0
##  6 ID_210421_1_D8      2            0     1     1     0     0     0     0     0
##  7 ID_210421_1_F2      1            0     0     0     0     0     0     0     0
##  8 ID_210421_1_F2      2            0     0     0     0     0     1     0     0
##  9 ID_210421_2_B2      1            0     1     0     3     1     1     3     4
## 10 ID_210421_2_B2      2            0     0     0     0     0     0     0     0
## # … with 180 more rows, and 11 more variables: L <int>, R <int>,
## #   total_turns <int>, reps <int>, alter <int>, partial <int>, rel_reps <dbl>,
## #   rel_alter <dbl>, rel_R <dbl>, rel_L <dbl>, asymmetry <dbl>

Merge with sex meta-data

# attach per-individual sex metadata
# NOTE(review): the `sex` table is built in an earlier section of this file
data_ymaze <- data_ymaze %>%
  left_join(sex, by = 'Individuals_ID_well')

Example visualisations

Distribution of trigrams

# total count of each trigram pattern summed over all flies and sequences
plot2.1 <- tri_long %>%
  # filter(n > 0) %>%
  ggplot(aes(x = Trigram, y = n)) +
  geom_bar(stat = "sum", fill = "slateblue2") +
  theme_classic() + labs(x = 'Trigram', y = 'Total count') +
  theme(legend.position = "none", text = element_text(size = 15))
plot2.1

Correlation of locomotor activity with alteration and repetition patterns. Patterns suggest a zero-inflated model may be warranted (not explored in this paper).

# locomotor activity (avg) vs proportion of repetitions, by sex
plot2.2 <- data_ymaze %>% left_join(data_locomotion_avg, by = 'Individuals_ID_well') %>%
  select(avg, Sex, rel_reps, rel_alter) %>%
  ggplot(aes(x = avg, y = rel_reps, colour = Sex)) +
  geom_point() + geom_smooth(method = 'lm') +
  scale_color_manual(values = c(col_accent1, col_accent2)) +
  theme_classic() + labs(x = 'Locomotion activity', y = 'Proportion of repetitions') +
  theme(text = element_text(size = 15))

# locomotor activity vs proportion of alternations, by sex
plot2.3 <- data_ymaze %>% left_join(data_locomotion_avg, by = 'Individuals_ID_well') %>%
  select(avg, Sex, rel_reps, rel_alter) %>%
  ggplot(aes(x = avg, y = rel_alter, colour = Sex)) +
  geom_point() + geom_smooth(method = 'lm') +
  scale_color_manual(values = c(col_accent1, col_accent2)) +
  theme_classic() + labs(x = 'Locomotion activity', y = 'Proportion of alternations') +
  theme(text = element_text(size = 15))

# locomotor activity vs raw alternation counts (filtering an extreme
# count outlier with alter >= 100)
plot2.3.1 <- data_ymaze %>% filter(alter < 100) %>%
  left_join(data_locomotion_avg, by = 'Individuals_ID_well') %>%
  select(avg, Sex, alter) %>%
  ggplot(aes(x = avg, y = alter, colour = Sex)) +
  geom_point() + geom_smooth(method = 'lm') +
  scale_color_manual(values = c(col_accent1, col_accent2)) +
  theme_classic() + labs(x = 'Locomotion activity', y = 'Count of alternations') +
  theme(text = element_text(size = 15))

plot2.2
## `geom_smooth()` using formula 'y ~ x'

plot2.3
## `geom_smooth()` using formula 'y ~ x'

plot2.3.1
## `geom_smooth()` using formula 'y ~ x'

General variation in repetition and alteration behaviour.

# Distribution of per-individual repetition and alternation proportions.
# FIX(review): the x-axis label previously read "Proportion of repetitions"
# although the histogram overlays BOTH behaviour types; also "Alterations"
# replaced with the standard Y-maze term "Alternations".
plot2.4 <- data_ymaze %>% pivot_longer(cols = c('rel_reps', 'rel_alter'), names_to = 'Type') %>%
  ggplot(aes(x = value, fill = Type)) +
  geom_histogram() +
  theme_classic() + theme(text = element_text(size = 15)) +
  # factor levels sort alphabetically: rel_alter -> Alternations, rel_reps -> Repetitions
  scale_fill_manual(values = c(col_accent1, col_accent2), labels = c("Alternations", "Repetitions")) +
  labs(x = "Proportion of turns (%)", fill = "Behaviour", y = 'Count')
plot2.4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# plot_ymaze1 <- ggarrange(plot2.1, plot2.3.1)
# plot_ymaze1
# ggsave(plot_ymaze1, filename = './R/visualisations/Fig2.pdf', scale = 0.8)
# 
# plot_ymaze2 <- ggarrange(plot2.2, plot2.3)
# plot_ymaze2
# ggsave(plot_ymaze2, filename = './R/visualisations/FigS3.pdf', scale = 0.8)
# 
# ggsave(plot2.4, filename = './R/visualisations/FigS2.pdf', scale = 0.8)

Sex differences in Y-maze behaviour

# Violin plot: proportion of repetitions by sex, jittered points plus
# mean +/- SE shown as a crossbar.
plot2.5 <- ggviolin(data_ymaze, x = "Sex", y = "rel_reps",
                    add = c("jitter", "mean_se"), error.plot = "crossbar",
                    fill = col_accent2, lwd = 0) +
  theme_classic() +
  theme(text = element_text(size = 15)) +
  labs(y = "Proportion of repetitions")
plot2.5

# Violin plot: proportion of alternations by sex (same layout as plot2.5).
plot2.6 <- ggviolin(data_ymaze, x = "Sex", y = "rel_alter",
                    add = c("jitter", "mean_se"), error.plot = "crossbar",
                    fill = col_accent1, lwd = 0) +
  theme_classic() +
  theme(text = element_text(size = 15)) +
  labs(y = "Proportion of alterations")
plot2.6

# Violin plot: turning asymmetry ("handedness") by sex.
# Individuals with undefined asymmetry appear as the removed-rows warnings below.
plot2.7 <- ggviolin(data_ymaze, x = "Sex", y = "asymmetry",
                    add = c("jitter", "mean_se"), error.plot = "crossbar",
                    fill = col_detail1, lwd = 0) +
  theme_classic() +
  theme(text = element_text(size = 15)) +
  labs(y = "Handedness (turning asymmetry)")
plot2.7
## Warning: Removed 4 rows containing non-finite values (stat_ydensity).
## Warning: Removed 4 rows containing non-finite values (stat_summary).
## Warning: Removed 4 rows containing missing values (geom_point).

# As plot2.5 but split by assay Sequence instead of Sex.
plot2.5s <- ggviolin(data_ymaze, x = "Sequence", y = "rel_reps",
                     add = c("jitter", "mean_se"), error.plot = "crossbar",
                     fill = col_accent2, lwd = 0) +
  theme_classic() +
  theme(text = element_text(size = 15)) +
  labs(y = "Proportion of repetitions")
plot2.5s

# As plot2.6 but split by assay Sequence instead of Sex.
plot2.6s <- ggviolin(data_ymaze, x = "Sequence", y = "rel_alter",
                     add = c("jitter", "mean_se"), error.plot = "crossbar",
                     fill = col_accent1, lwd = 0) +
  theme_classic() +
  theme(text = element_text(size = 15)) +
  labs(y = "Proportion of alterations")
plot2.6s

# As plot2.7 but split by assay Sequence instead of Sex.
plot2.7s <- ggviolin(data_ymaze, x = "Sequence", y = "asymmetry",
                     add = c("jitter", "mean_se"), error.plot = "crossbar",
                     fill = col_detail1, lwd = 0) +
  theme_classic() +
  theme(text = element_text(size = 15)) +
  labs(y = "Handedness (turning asymmetry)")
plot2.7s
## Warning: Removed 4 rows containing non-finite values (stat_ydensity).
## Warning: Removed 4 rows containing non-finite values (stat_summary).
## Warning: Removed 4 rows containing missing values (geom_point).

# plot_ymaze_sex <- ggarrange(plot2.5, plot2.6, plot2.7, nrow = 1)
# plot_ymaze_sex
# ggsave(plot_ymaze_sex, filename = "./R/visualisations/FigS4.pdf")

Example data analysis

Formal analysis of whether alternation behaviour depends on overall locomotion activity of flies and whether it interacts with sex.

# model: variance between rounds, individuals and batches, fixed effect of sex and assay date
# Poisson GLM: alternation count ~ sex * average locomotion.
# Note: the data still contain one extreme count (alter >= 100) that inflates
# the male main effect; model4 below refits with that point excluded.
model3 <- glm(alter ~ Sex * avg,
             data = data_ymaze %>% left_join(data_locomotion_avg, by = 'Individuals_ID_well'),
             family = 'poisson')
summary(model3)
## 
## Call:
## glm(formula = alter ~ Sex * avg, family = "poisson", data = data_ymaze %>% 
##     left_join(data_locomotion_avg, by = "Individuals_ID_well"))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -13.934   -2.784   -1.691   -0.956   63.897  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.3993144  0.0819107  -4.875 1.09e-06 ***
## SexM         1.6003805  0.0844289  18.955  < 2e-16 ***
## avg          0.0057658  0.0004350  13.256  < 2e-16 ***
## SexM:avg    -0.0029701  0.0004362  -6.809 9.81e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 34518  on 949  degrees of freedom
## Residual deviance: 27999  on 946  degrees of freedom
## AIC: 29156
## 
## Number of Fisher Scoring iterations: 12
# Quick diagnostic scatter: alternation count vs. locomotion, per-sex OLS fits.
data_ymaze %>%
  left_join(data_locomotion_avg, by = 'Individuals_ID_well') %>%
  ggplot(aes(x = avg, y = alter, colour = Sex)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  scale_color_manual(values = c(col_accent1, col_accent2)) +
  labs(x = 'Locomotion activity', y = 'Number of alterations') +
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'

# there is one clear outlier that can be an artifact


# Refit of model3 after excluding the extreme-count outlier (alter >= 100).
model4 <- glm(
  formula = alter ~ Sex * avg,
  family = 'poisson',
  data = data_ymaze %>%
    left_join(data_locomotion_avg, by = 'Individuals_ID_well') %>%
    filter(alter < 100)
)
summary(model4)
## 
## Call:
## glm(formula = alter ~ Sex * avg, family = "poisson", data = data_ymaze %>% 
##     left_join(data_locomotion_avg, by = "Individuals_ID_well") %>% 
##     filter(alter < 100))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2703  -1.7490  -1.2557   0.3544   6.7539  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.3993144  0.0819104  -4.875 1.09e-06 ***
## SexM         0.7909469  0.0891618   8.871  < 2e-16 ***
## avg          0.0057658  0.0004350  13.256  < 2e-16 ***
## SexM:avg    -0.0048293  0.0004477 -10.788  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3748.0  on 944  degrees of freedom
## Residual deviance: 3495.2  on 941  degrees of freedom
## AIC: 4611
## 
## Number of Fisher Scoring iterations: 6
# Poisson GLM: alternation count by sex and assay sequence (treated as a factor).
# NOTE(review): the name `model5` is reassigned to an nlme::lme fit later in
# this document - consider distinct names to avoid accidental shadowing.
model5 <- glm(alter ~ Sex + as.factor(Sequence),
             data = data_ymaze,
             family = 'poisson')
summary(model5)
## 
## Call:
## glm(formula = alter ~ Sex + as.factor(Sequence), family = "poisson", 
##     data = data_ymaze)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.770  -3.986  -2.027  -0.707  66.321  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -1.10775    0.15521  -7.137 9.54e-13 ***
## SexM                  1.82728    0.13993  13.058  < 2e-16 ***
## as.factor(Sequence)2  1.71190    0.08813  19.425  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 6903.6  on 189  degrees of freedom
## Residual deviance: 6077.2  on 187  degrees of freedom
## AIC: 6313.2
## 
## Number of Fisher Scoring iterations: 8
# Companion to model5: repetition count by sex and assay sequence.
# (Note: `model6` is also reassigned to an lme fit later in the document.)
model6 <- glm(
  formula = reps ~ Sex + as.factor(Sequence),
  family = 'poisson',
  data = data_ymaze
)
summary(model6)
## 
## Call:
## glm(formula = reps ~ Sex + as.factor(Sequence), family = "poisson", 
##     data = data_ymaze)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.670  -3.670  -2.816  -1.481  24.523  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           0.09254    0.12481   0.742    0.458    
## SexM                  1.28505    0.12185  10.546  < 2e-16 ***
## as.factor(Sequence)2  0.52966    0.07217   7.339 2.16e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3346.2  on 189  degrees of freedom
## Residual deviance: 3133.0  on 187  degrees of freedom
## AIC: 3336.6
## 
## Number of Fisher Scoring iterations: 7

Light-off startle response assay

Analysis of habituation data based on Allen TA, Budenberg WJ (2021). Replicating Light-Off Startle Responses in Drosophila melanogaster. BiorXiv.

Data parsing

#specify directory of zantiks habituation files
data_path <- here("data", "habituation/") 

#create list of files from directory
file_list <- list.files(data_path)

# Zero-row template carrying the column names/types of the first file;
# rows from every file are appended to it below.
df <- head(
  read_csv(paste0(data_path, file_list[1]),
           skip = 4, col_names = TRUE, guess_max = 100),
  0
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   UNIT = col_character(),
##   TIMESLOT = col_character(),
##   PLATE_ID = col_character(),
##   TYPE = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
## row col    expected    actual                                                                                                             file
##  82  -- 107 columns 4 columns '/Users/szymek/Dropbox/#SCIENCE/R/220330__Dmel_methods/data/habituation/E_allen_habituation-20000228T233651.csv'
#create new list without demographic info
# Read every habituation CSV into a named list of tibbles; head(-1) drops the
# trailing technical line each unit appends to its output.
# (Replaces the original grow-inside-a-loop over a `c()`-initialised object
# with lapply plus explicit names - same result, idiomatic construction.)
new_list <- lapply(file_list, function(f) {
  read_csv(paste0(data_path, f),
           skip = 4, col_names = TRUE, guess_max = 100) %>%
    head(-1)
})
names(new_list) <- file_list
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   UNIT = col_character(),
##   TIMESLOT = col_character(),
##   PLATE_ID = col_character(),
##   TYPE = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
## row col    expected    actual                                                                                                             file
##  82  -- 107 columns 4 columns '/Users/szymek/Dropbox/#SCIENCE/R/220330__Dmel_methods/data/habituation/E_allen_habituation-20000228T233651.csv'
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   UNIT = col_character(),
##   TIMESLOT = col_character(),
##   PLATE_ID = col_character(),
##   TYPE = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
## row col    expected    actual                                                                                                             file
##  82  -- 107 columns 4 columns '/Users/szymek/Dropbox/#SCIENCE/R/220330__Dmel_methods/data/habituation/E_allen_habituation-20000229T013652.csv'
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   UNIT = col_character(),
##   TIMESLOT = col_character(),
##   PLATE_ID = col_character(),
##   TYPE = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
## row col    expected    actual                                                                                                             file
##  82  -- 107 columns 4 columns '/Users/szymek/Dropbox/#SCIENCE/R/220330__Dmel_methods/data/habituation/E_allen_habituation-20000229T035615.csv'
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   UNIT = col_character(),
##   TIMESLOT = col_character(),
##   PLATE_ID = col_character(),
##   TYPE = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
## row col    expected    actual                                                                                                             file
##  82  -- 107 columns 4 columns '/Users/szymek/Dropbox/#SCIENCE/R/220330__Dmel_methods/data/habituation/E_allen_habituation-20000229T052340.csv'
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   UNIT = col_character(),
##   TIMESLOT = col_character(),
##   PLATE_ID = col_character(),
##   TYPE = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
## row col    expected    actual                                                                                                             file
##  82  -- 107 columns 4 columns '/Users/szymek/Dropbox/#SCIENCE/R/220330__Dmel_methods/data/habituation/E_allen_habituation-20000229T233253.csv'
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   UNIT = col_character(),
##   TIMESLOT = col_character(),
##   PLATE_ID = col_character(),
##   TYPE = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
## row col    expected    actual                                                                                                             file
##  82  -- 107 columns 4 columns '/Users/szymek/Dropbox/#SCIENCE/R/220330__Dmel_methods/data/habituation/E_allen_habituation-20000301T010322.csv'
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   UNIT = col_character(),
##   TIMESLOT = col_character(),
##   PLATE_ID = col_character(),
##   TYPE = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
## row col    expected    actual                                                                                                             file
##  82  -- 107 columns 4 columns '/Users/szymek/Dropbox/#SCIENCE/R/220330__Dmel_methods/data/habituation/E_allen_habituation-20000301T035328.csv'
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   UNIT = col_character(),
##   TIMESLOT = col_character(),
##   PLATE_ID = col_character(),
##   TYPE = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
## row col    expected    actual                                                                                                             file
##  82  -- 107 columns 4 columns '/Users/szymek/Dropbox/#SCIENCE/R/220330__Dmel_methods/data/habituation/E_allen_habituation-20000301T051513.csv'
#append all files to df
# bind_rows() appends every tibble in new_list to the zero-row header template
# in one call, instead of growing df row-by-row in a loop with add_row()
# (add_row() expects name-value pairs, not a whole data frame).
df <- bind_rows(df, new_list)

Let’s select and rename relevant variables. In the course of this we also remove wells F6, F7, F8 (they are always empty in our design).

# Drop technical columns, convert design variables to factors (for the ANOVA),
# rename to analysis-friendly names, and remove the always-empty wells F6-F8
# together with their MSD counterparts - all in one pipeline.
df <- df %>%
  select(!c(RUNTIME, UNIT, TIMESLOT, TEMPERATURE)) %>%
  as_factor(BLOCK, TYPE) %>%
  rename(Datafile_ID = PLATE_ID, time_bin = TIME_BIN,
         Block = BLOCK, Trial = TRIAL, Type = TYPE,
         pre_post_counter = PRE_POST_COUNTER,
         startle_number = STARTLE_NUMBER) %>%
  select(-c(F6:F8), -c(F6MSD:F8MSD))

Produce population-average distance data. Here we use the mean squared distance data and total distance travelled from each well to assemble a plate-level (hence - population) measure of fly behaviour during the trial. These produce two summary statistics of fly activity in arenas.

# Plate-level (population) summaries per row:
#   total_dist - summed travelled distance across wells (non-MSD columns)
#   total_act  - summed mean-squared-distance activity across wells (MSD columns)
# Both rowSums() draw on the unmodified df, as in the original two-step version.
pop_data <- df %>%
  mutate(
    total_dist = rowSums(select(df, !ends_with("MSD"),
                                -Datafile_ID, -time_bin, -Block,
                                -Trial, -Type, -pre_post_counter,
                                -startle_number)),
    total_act = rowSums(select(df, ends_with("MSD")))
  )

Raw habituation data can be now represented with well ID information.

# Long-format distance data: one row per (datafile, time bin, well).
dfile_dist <- df %>% 
  select(!ends_with("MSD")) %>%
  gather(key = "Well", value = "distance", -Datafile_ID,
         -time_bin, -Block, -Trial, -Type, -pre_post_counter,
         -startle_number) %>%
  convert_as_factor(Well)

# create file with well factor and MSD activity dv only
# NOTE(review): this gather has no id columns, so correct pairing with
# dfile_dist relies on both gather() calls stacking wells in the same column
# order and rows in the same order. This holds for gather() (column-major
# stacking) but would break under pivot_longer()'s row-major default -
# do not "modernise" this without re-checking alignment.
dfile_act<- df %>%
  select(ends_with("MSD")) %>%
  gather(key = "Well", value = "activity") %>%
  convert_as_factor(Well)

# remove duplicate well variable before adding activity data
# (the MSD well names carry an "MSD" suffix; the distance-side Well column is kept)
dfile_act <- select(dfile_act, -'Well')

# add activity column to rest of data
# add_column() binds purely by position - it assumes the row orders of
# dfile_dist and dfile_act are aligned (see the note above).
df <- add_column(dfile_dist, dfile_act)

# Drop acclimation rows, then keep only light-off startle events from the
# first repeat (Block == 1), for both individual- and population-level data.
no_acclimation <- df %>% filter(Type != "ACCLIMATION")

startles_only <- no_acclimation %>% filter(Type == "STARTLE", Block == 1)
pop_startles_only <- pop_data %>% filter(Type == "STARTLE", Block == 1)

Merge with sex meta-data

We also add information about sex.

# Attach run metadata (individual IDs, experimental block, batch) and then sex.
# The first join key is now explicit: the original implicit natural join
# matched on Datafile_ID only (per the printed join message).
startles_only <- startles_only %>%
  left_join(select(run_register, Datafile_ID, Individuals_ID, Exp_block, Batch_ID),
            by = "Datafile_ID") %>%
  # unique per-fly key: individual ID combined with its well position
  mutate(Individuals_ID_well = paste0(Individuals_ID, "_", Well)) %>%
  left_join(sex, by = "Individuals_ID_well")

Additional data processing

Below, to facilitate the following summaries, we aggregate the data by the light-off shock number.

# Mean and SD of plate-level total distance for each light-off startle number.
pop_startles_block <-
  pop_startles_only %>%
  group_by(startle_number) %>%
  get_summary_stats(total_dist, type = "mean_sd")

We generate overall summary of distances traveled by individual flies during startles 1 through 3, in two variants: including and excluding flies that did not move at all during all three repeats of startling.

# Group by individual so the activity filter below sums distance per fly.
# NOTE(review): startles_only remains grouped after this point - downstream
# code that expects ungrouped data should call ungroup() first.
startles_only <- startles_only %>%
  group_by(Individuals_ID_well)

# Keep only flies that moved at all across the three startle repeats.
startles_only_act <- startles_only %>%
  filter(sum(distance) != 0) %>%
  ungroup()

# Per-startle mean/SD of distance - active flies only...
overall_summary_act <- startles_only_act %>%
  group_by(startle_number) %>%
  get_summary_stats(distance, type = "mean_sd")

# ...and for all flies, including complete non-movers.
overall_summary <- startles_only %>%
  group_by(startle_number) %>%
  get_summary_stats(distance, type = "mean_sd")

Example visualisations

# pop habituation averaged across blocks graph
plot3.1 <- gg_pop_hab_block <- ggplot(data=pop_startles_block, aes(x = startle_number, y = mean,)) +
  xlab("Trial") +
  ylab("Average Population Distance Travelled (pixels)") +
  geom_errorbar(aes(ymin=mean-(sd/sqrt(2)), ymax=mean+(sd/sqrt(2))), width=1, color = col_accent1) +
  geom_line() +
  geom_point() + theme_classic() + theme(text = element_text(size = 15))
plot3.1

# Per-individual habituation trajectories (active flies only), one facet per
# fly; facet strips are blanked because individual IDs are not informative here.
plot3.2 <- gg_habituation <- ggplot(startles_only_act,
                                    aes(x = startle_number, y = distance)) +
  facet_wrap(facets = ~ Individuals_ID_well) +
  geom_line() +
  geom_point(col = col_accent1) +
  theme_bw() +
  scale_x_continuous(breaks = c(1, 2, 3)) +
  scale_y_continuous(breaks = c(10, 30)) +
  labs(x = 'Startle number', y = 'Total distance travelled') +
  theme(strip.text.x = element_blank(), text = element_text(size = 15))
plot3.2

# ggsave(plot3.1, filename = './R/visualisations/FigS5.pdf', scale = 0.8)
# ggsave(plot3.2, filename = './R/visualisations/Fig3.pdf')

Example analysis

# Linear mixed model: distance ~ sex x startle number, with random intercepts
# for experimental block and (per nlme's list() convention) for individual
# nested within block.
# NOTE(review): this reuses the names `model5`/`model6` from the earlier
# Poisson GLMs, silently shadowing them.
model5 <- lme(distance ~ Sex*startle_number,
              random = list(~1|Exp_block, ~1|Individuals_ID_well),
               data = startles_only_act)
summary(model5)
## Linear mixed-effects model fit by REML
##   Data: startles_only_act 
##        AIC      BIC    logLik
##   1960.017 1986.461 -973.0087
## 
## Random effects:
##  Formula: ~1 | Exp_block
##         (Intercept)
## StdDev:   0.2900206
## 
##  Formula: ~1 | Individuals_ID_well %in% Exp_block
##         (Intercept) Residual
## StdDev:    1.072292 4.653713
## 
## Fixed effects:  distance ~ Sex * startle_number 
##                         Value Std.Error  DF   t-value p-value
## (Intercept)          3.308081 0.9322378 216  3.548537  0.0005
## SexM                 0.886467 1.3890260 104  0.638193  0.5248
## startle_number       0.313934 0.4213274 216  0.745108  0.4570
## SexM:startle_number -0.810809 0.6349102 216 -1.277046  0.2030
##  Correlation: 
##                     (Intr) SexM   strtl_
## SexM                -0.655              
## startle_number      -0.904  0.607       
## SexM:startle_number  0.600 -0.914 -0.664
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -1.0333550 -0.7152464 -0.3681651  0.5110548  6.4093556 
## 
## Number of Observations: 327
## Number of Groups: 
##                          Exp_block Individuals_ID_well %in% Exp_block 
##                                  4                                109
# As model5, but additionally allowing a random slope of startle_number per
# individual (individual-level variation in habituation rate).
model6 <- lme(distance ~ Sex*startle_number,
               random = list(~1|Exp_block, ~1+startle_number|Individuals_ID_well),
               data = startles_only_act)
summary(model6)
## Linear mixed-effects model fit by REML
##   Data: startles_only_act 
##        AIC      BIC    logLik
##   1953.128 1987.127 -967.5639
## 
## Random effects:
##  Formula: ~1 | Exp_block
##         (Intercept)
## StdDev:   0.2994339
## 
##  Formula: ~1 + startle_number | Individuals_ID_well %in% Exp_block
##  Structure: General positive-definite, Log-Cholesky parametrization
##                StdDev   Corr  
## (Intercept)    5.281135 (Intr)
## startle_number 2.578841 -0.939
## Residual       3.881778       
## 
## Fixed effects:  distance ~ Sex * startle_number 
##                         Value Std.Error  DF   t-value p-value
## (Intercept)          3.308016 1.0280227 216  3.217844  0.0015
## SexM                 0.890470 1.5338910 104  0.580530  0.5628
## startle_number       0.313934 0.4822169 216  0.651023  0.5157
## SexM:startle_number -0.810809 0.7266663 216 -1.115793  0.2658
##  Correlation: 
##                     (Intr) SexM   strtl_
## SexM                -0.656              
## startle_number      -0.921  0.617       
## SexM:startle_number  0.611 -0.930 -0.664
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -1.8217871 -0.5456028 -0.2667349  0.4846208  5.2726074 
## 
## Number of Observations: 327
## Number of Groups: 
##                          Exp_block Individuals_ID_well %in% Exp_block 
##                                  4                                109
## comparison of intercept-only and random slopes models
# Likelihood-ratio test of the random-effects structure; valid under REML here
# because both models share identical fixed effects.
anova(model5, model6)
##        Model df      AIC      BIC    logLik   Test  L.Ratio p-value
## model5     1  7 1960.017 1986.461 -973.0087                        
## model6     2  9 1953.128 1987.127 -967.5639 1 vs 2 10.88969  0.0043

Revision: multi-variate data processing

Merge data across individuals

# One row per individual: average locomotion plus all Y-maze summary variables,
# as a plain data.frame (MCMCglmm below does not accept tibbles gracefully).
data_multivar <- data_ymaze %>%
  left_join(data_locomotion_avg, by = 'Individuals_ID_well') %>%
  select(Individuals_ID_well, avg, reps, alter, partial,
         rel_reps, rel_alter, rel_R, rel_L, asymmetry, Sex) %>%
  rename(locomotion = avg) %>%
  as.data.frame()
library(MCMCglmm)
## Loading required package: coda
## Loading required package: ape
## 
## Attaching package: 'ape'
## The following object is masked from 'package:ggpubr':
## 
##     rotate
# Multivariate Gaussian mixed model over four traits: log(locomotion + 1),
# rel_alter, rel_reps, asymmetry.
#  - `-1 + trait * Sex` drops the global intercept so each trait gets its own
#    mean plus a male contrast per trait.
#  - `rcov = ~us(trait):units` estimates the full 4x4 residual covariance matrix.
#  - Rows with infinite asymmetry are excluded before fitting.
#  - Prior: inverse-Wishart on R with V = diag(4), nu = 3.002.
#  - 100k iterations, 20k burn-in, thinning 80 -> 1000 posterior samples.
model <- MCMCglmm(cbind(log(locomotion + 1), rel_alter, rel_reps, asymmetry) ~ -1 + trait * Sex,
                  rcov = ~ us(trait):units,
                  data = data_multivar[!is.infinite(data_multivar$asymmetry), ],
                  family = rep('gaussian', 4),
                  prior = list(R = list(V = diag(4), nu=3.002)),
                  nitt = 100000, burnin = 20000, thin = 80, verbose = F)
# MCMC diagnostics: trace and density plots for all parameters.
plot(model)

summary(model)
## 
##  Iterations = 20001:99921
##  Thinning interval  = 80
##  Sample size  = 1000 
## 
##  DIC: 22708.22 
## 
##  R-structure:  ~us(trait):units
## 
##                                       post.mean l-95% CI u-95% CI eff.samp
## traitlocomotion:traitlocomotion.units    2.0724   1.8949   2.2787   1000.0
## traitrel_alter:traitlocomotion.units    -1.9131  -3.6155  -0.2822    892.7
## traitrel_reps:traitlocomotion.units      3.4474   1.6777   5.0844   1000.0
## traitasymmetry:traitlocomotion.units    -0.2848  -0.4157  -0.1753   1000.0
## traitlocomotion:traitrel_alter.units    -1.9131  -3.6155  -0.2822    892.7
## traitrel_alter:traitrel_alter.units    348.0216 319.2221 379.8938    896.1
## traitrel_reps:traitrel_alter.units     -13.0926 -37.9529  10.7844   1000.0
## traitasymmetry:traitrel_alter.units      2.5751   1.1933   4.4039    774.6
## traitlocomotion:traitrel_reps.units      3.4474   1.6777   5.0844   1000.0
## traitrel_alter:traitrel_reps.units     -13.0926 -37.9529  10.7844   1000.0
## traitrel_reps:traitrel_reps.units      377.7743 344.4336 413.4602   1000.0
## traitasymmetry:traitrel_reps.units      -4.7534  -6.3033  -3.0277   1000.0
## traitlocomotion:traitasymmetry.units    -0.2848  -0.4157  -0.1753   1000.0
## traitrel_alter:traitasymmetry.units      2.5751   1.1933   4.4039    774.6
## traitrel_reps:traitasymmetry.units      -4.7534  -6.3033  -3.0277   1000.0
## traitasymmetry:traitasymmetry.units      1.8456   1.6921   2.0048   1000.0
## 
##  Location effects: cbind(log(locomotion + 1), rel_alter, rel_reps, asymmetry) ~ -1 + trait * Sex 
## 
##                     post.mean l-95% CI u-95% CI eff.samp  pMCMC    
## traitlocomotion       3.10334  2.91359  3.27740    923.8 <0.001 ***
## traitrel_alter       13.80979 11.55302 16.31253   1000.0 <0.001 ***
## traitrel_reps         5.83745  3.33046  8.18500   1000.0 <0.001 ***
## traitasymmetry       -0.09054 -0.26238  0.06946   1000.0   0.31    
## SexM                  0.88549  0.67200  1.09693    887.8 <0.001 ***
## traitrel_alter:SexM  -3.63499 -6.12849 -0.47774   1000.0   0.01 ** 
## traitrel_reps:SexM    5.06960  2.40411  8.00891   1000.0 <0.001 ***
## traitasymmetry:SexM  -1.19546 -1.51417 -0.86264   1000.0 <0.001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Posterior-mean residual correlation matrix among the four traits
# (row/column order: locomotion, rel_alter, rel_reps, asymmetry).
matrix(colMeans(posterior.cor(model$VCV)), 4, 4)
##             [,1]        [,2]        [,3]       [,4]
## [1,]  1.00000000 -0.07125421  0.12318765 -0.1455909
## [2,] -0.07125421  1.00000000 -0.03611113  0.1015474
## [3,]  0.12318765 -0.03611113  1.00000000 -0.1799740
## [4,] -0.14559088  0.10154737 -0.17997398  1.0000000
# plot correlations
# Marginal distributions of each trait, to eyeball the Gaussian assumption.
hist(data_multivar$locomotion)

hist(data_multivar$rel_reps)

hist(data_multivar$rel_alter)

hist(data_multivar$asymmetry)

# Pairwise scatterplots of the (log-transformed) traits used in the model.
pairs(~log(locomotion+1)+log(rel_alter+1)+log(rel_reps+1)+asymmetry, data = data_multivar)